Spaces:

pranavraj1103
/

ksp-vanguard_squad

Runtime error

App Files Files Community

pranavraj1103 committed on May 17, 2024

Commit

dae4805

1 Parent(s): d54a6ab

chore: Add Dockerfile and requirements.txt for containerization

Browse files

Files changed (6) hide show

.gitignore +160 -0
Dockerfile +13 -0
app.py +196 -0
note.txt +23 -0
requirements.txt +77 -0
run.py +9 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+# Base image: Python 3.11 on slim Debian bullseye.
+FROM python:3.11-slim-bullseye
+WORKDIR /code
+# Copy only the requirements file first so the dependency install runs before the source COPY below.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# NOTE(review): requirements.txt already pins the en-core-web-lg wheel and app.py only loads
+# en_core_web_sm, so this download looks redundant -- confirm before removing.
+RUN python -m spacy download en_core_web_lg
+COPY . .
+# Serve the `app` object from app.py with uvicorn on all interfaces, port 7860.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import os
+import re
+import spacy
+import uvicorn
+import docx
+import requests
+import spacy
+from presidio_analyzer import RecognizerRegistry
+from presidio_analyzer.nlp_engine import (
+    NlpEngine,
+    NlpEngineProvider,
+)
+# import google.generativeai as genai
+from dotenv import load_dotenv
+from transformers import pipeline
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
+from fastapi import FastAPI, Request, UploadFile, File
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+# Load settings from a .env file (python-dotenv) before the environment is read below.
+load_dotenv()
+# ROOT_PATH is taken from the environment; os.environ.get yields None when it is unset.
+app = FastAPI(root_path=os.environ.get("ROOT_PATH"))
+# genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
+# model = genai.GenerativeModel('gemini-pro')
+# Used as the Bearer token by anonymize_masked_text; None when the variable is unset.
+HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
+# pipe = pipeline("fill-mask", model="pranavraj1103/ksp-mask-model")
+def create_nlp_engine_with_spacy(
+    model_path: str = "en_core_web_sm",
+):
+    """
+    Instantiate an NlpEngine with a spaCy model
+    :param model_path: path to model / model name.
+    """
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": model_path}],
+        "ner_model_configuration": {
+            "model_to_presidio_entity_mapping": {
+                "PER": "PERSON",
+                "PERSON": "PERSON",
+                "NORP": "NRP",
+                "FAC": "FACILITY",
+                "LOC": "LOCATION",
+                "GPE": "LOCATION",
+                "LOCATION": "LOCATION",
+                "ORG": "ORGANIZATION",
+                "ORGANIZATION": "ORGANIZATION",
+                "DATE": "DATE_TIME",
+                "TIME": "DATE_TIME",
+            },
+            "low_confidence_score_multiplier": 0.4,
+            "low_score_entity_names": ["ORG", "ORGANIZATION"],
+        },
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
+    return nlp_engine, registry
+# Built once at import time with the default model name ("en_core_web_sm").
+nlp_engine, registry = create_nlp_engine_with_spacy()
+# Analyzer used by the /presidio_mask handler.
+analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
+# Only referenced from commented-out code in this file.
+anonymizer = AnonymizerEngine()
+@app.get("/")
+async def read_root():
+    """Return a fixed greeting payload for GET /."""
+    return {"message": "Hello World"}
+@app.get("/vocab_thresh_masking")
+async def vocab_thresh_masking(text, threshold):
+    ner_model = spacy.load("en_core_web_sm")
+    doc = ner_model(text)
+    word_counts = dict()
+    for token in doc:
+        word_counts[token.text] = word_counts.get(str(token.text), 0) + 1
+    threshold = int(threshold)
+    frequent_words = [word for word, count in word_counts.items() if count >= threshold]
+    masked_text = []
+    pii_locations = []  # List to store (start index, end index, type) tuples
+    for i, token in enumerate(doc):
+        if str(token.text) in frequent_words:
+            masked_text.append(str(token.text))
+        else:
+            masked_text.append("[MASK]")
+            # Potentially masked PII, record location and tentative type (UNKNOWN)
+            pii_locations.append((token.idx, token.idx + len(token.text), "UNKNOWN"))
+    return " ".join(masked_text), pii_locations
+@app.get("/entity_tagger_masking")
+async def entity_tagger_masking(text):
+    ner_model = spacy.load("en_core_web_sm")
+    doc = ner_model(text)
+    masked_text = []
+    pii_locations = []
+    for token in doc:
+        if token.ent_type_ == "PERSON":
+            masked_text.append("[MASK]")
+            pii_locations.append((token.idx, token.idx + len(token.text), "PERSON"))
+        elif token.ent_type_ == "LOC":
+            masked_text.append("[MASK]")
+            pii_locations.append((token.idx, token.idx + len(token.text), "LOCATION"))
+        elif token.ent_type_ == "ORG":
+            masked_text.append("[MASK]")
+            pii_locations.append((token.idx, token.idx + len(token.text), "ORGANIZATION"))
+        elif token.ent_type_ == "DATE":
+            masked_text.append("[MASK]")
+            pii_locations.append((token.idx, token.idx + len(token.text), "DATE"))
+        else:
+            masked_text.append(token.text)
+    return " ".join(masked_text), pii_locations
+@app.get("/email_and_phone")
+async def identify_email_and_phone(text):
+    # use regex to identify emails and phone numbers and mask them
+    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+    phone_pattern = r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b"
+    # find the location of emails and phone numbers
+    pii_locations = []
+    for match in re.finditer(email_pattern, text):
+        pii_locations.append((match.start(), match.end(), "EMAIL"))
+    for match in re.finditer(phone_pattern, text):
+        pii_locations.append((match.start(), match.end(), "PHONE NUMBER"))
+    # mask the emails and phone numbers
+    text = re.sub(email_pattern, "[MASK]", text)
+    text = re.sub(phone_pattern, "[MASK]", text)
+    return text, pii_locations
+@app.get("/anonymize_masked_text")
+async def anonymize_masked_text(masked_text):
+    # prompt = f"The following text contains Personal Information Identifiers marked with [MASK]: \n```\n{masked_text}\n```\n Please anonymize these Personal Identity Identifiers by replacing the '[MASK]' with random placeholders while preserving the context so that the text can be used for analysis."
+    # print(prompt)
+    # response = model.generate_content(prompt)
+    # return response.text
+    API_URL = "https://api-inference.huggingface.co/models/pranavraj1103/ksp-mask-model"
+    headers = {"Authorization": f"Bearer {HUGGINGFACE_KEY}"}
+    def query(payload):
+        response = requests.post(API_URL, headers=headers, json=payload)
+        return response.json()
+    output = query({
+        "inputs": "The <mask> to the universe is <mask>.",
+    })
+    return output
+@app.post("/parse_doc")
+async def parse_doc(file: UploadFile):
+    if file.filename.endswith(".txt"):
+        return file.file.read()
+    doc = docx.Document(file.file)
+    full_text = []
+    for para in doc.paragraphs:
+        full_text.append(para.text)
+    return "\n".join(full_text)
+@app.post("/presidio_mask")
+async def presidio_mask(text):
+    results = analyzer.analyze(text=text, language='en')
+    # for rec in results:
+    #     print(rec.start)
+    # print(*[text[res.start : res.end] for res in results])
+    # anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results)
+    # return anonymized_text, results
+    return_list = []
+    seen_set = set()
+    for rec in results:
+        if (rec.score < 0.1) or (rec.start, rec.end) in seen_set:
+            continue
+        return_list.append({
+            "start": rec.start,
+            "end": rec.end,
+            "entity_type": rec.entity_type,
+            "text": text[rec.start:rec.end],
+            "score": rec.score,
+        })
+        seen_set.add((rec.start, rec.end))
+    return return_list

note.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+#downloading spacy model
+python -m spacy download en_core_web_lg
+sample_text = "My phone number is 212-555-5555, and my friend number is 9876543210"
+sample_text_2 = """The text in the image is a police report from the Amengad Police Station in Bagalkot, Karnataka, India. The report is dated 10-11-2022 and is about a man named Ramasawamy. The report states that Ramasawamy is a "rowdy" and a "habitual offender" who "disturbs public peace in public places." The report also states that Ramasawamy is "under surveillance."
+The report is signed by a police officer named SOMAPPA. The report is also stamped with the seal of the Amengad Police Station.
+Police Report Police Station:
+Amengad PS Case Number: 2022000003
+Date: 10-11-2022
+Subject: Ramasawamy
+Details: The accused is a rowdy and a habitual offender. He disturbs public peace in public places. He is under surveillance.
+Action Taken: The accused has been warned. He has been told to stop disturbing public peace.
+Signature: SOMAPPA Police
+Officer Seal: Amengad Police Station"""

requirements.txt ADDED Viewed

	@@ -0,0 +1,77 @@

+annotated-types==0.6.0
+anyio==4.3.0
+blis==0.7.11
+catalogue==2.0.10
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.16.0
+colorama==0.4.6
+confection==0.1.4
+cymem==2.0.8
+dnspython==2.6.1
+email_validator==2.1.1
+en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+fastapi==0.111.0
+fastapi-cli==0.0.3
+filelock==3.14.0
+fsspec==2024.5.0
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.0
+idna==3.7
+Jinja2==3.1.4
+langcodes==3.4.0
+language_data==1.2.0
+lxml==5.2.2
+marisa-trie==1.1.1
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+murmurhash==1.0.10
+numpy==1.26.4
+orjson==3.10.3
+packaging==24.0
+phonenumbers==8.13.37
+pillow==10.3.0
+preshed==3.0.9
+presidio-analyzer==2.2.354
+presidio-anonymizer==2.2.354
+pycryptodome==3.20.0
+pydantic==2.7.1
+pydantic_core==2.18.2
+Pygments==2.18.0
+python-docx==1.1.2
+python-dotenv==1.0.1
+python-multipart==0.0.9
+PyYAML==6.0.1
+regex==2024.5.15
+requests==2.31.0
+requests-file==2.0.0
+rich==13.7.1
+safetensors==0.4.3
+shellingham==1.5.4
+smart-open==6.4.0
+sniffio==1.3.1
+spacy==3.7.4
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+srsly==2.4.8
+starlette==0.37.2
+thinc==8.2.3
+tldextract==5.1.2
+tokenizers==0.19.1
+tqdm==4.66.4
+transformers==4.40.2
+typer==0.9.4
+typing_extensions==4.11.0
+ujson==5.10.0
+urllib3==2.2.1
+uvicorn==0.29.0
+wasabi==1.1.2
+watchfiles==0.21.0
+weasel==0.3.4
+websockets==12.0

run.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import os
+import sys
+import argparse
+from typing import List, Optional, Union
+import uvicorn
+# Script entry point: serve the `app` object from the `app` module with uvicorn
+# on 127.0.0.1:8000, with reload enabled.
+if __name__ == "__main__":
+     uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True)