cta2106 committed
Commit · 6431a8f
Parent(s): a532d6c
first commit

Files changed:
- .DS_Store +0 -0
- .idea/.gitignore +8 -0
- api.py +42 -0
- appsearch.py +106 -0
- config.py +1 -0
- requirements.txt +6 -0
- utils.py +23 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
api.py
ADDED
@@ -0,0 +1,42 @@
+from fastapi import FastAPI
+from utils import get_us_speeches
+from config import UPDATE_SPEECHES
+
+from haystack.document_stores import ElasticsearchDocumentStore
+from haystack.nodes import ElasticsearchRetriever
+from haystack.nodes import FARMReader
+from haystack.pipelines import ExtractiveQAPipeline
+
+import gradio as gr
+
+
+document_store = ElasticsearchDocumentStore(
+    host='fgm-v2.es.eastus2.azure.elastic-cloud.com',
+    username='elastic',
+    password='cxjWqZfmhcfhzpWmfX57ylJc',
+    scheme='https',
+    port=9243,
+    index='us-speeches'
+)
+
+if UPDATE_SPEECHES:
+    us_speeches = get_us_speeches()
+    document_store.write_documents(us_speeches)
+
+retriever = ElasticsearchRetriever(
+    document_store=document_store
+)
+
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
+
+pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
+
+app = FastAPI()
+
+
+async def run_query(query: str):
+    return pipeline.run(query=query)
+
+
+gr.Interface(predict_fn, "textbox", ["label", "label"]).launch()
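
Note: as committed, api.py passes predict_fn to gr.Interface even though no predict_fn is defined in the file, and run_query is never registered as a FastAPI route. Below is a minimal sketch of one possible way to wire the pipeline into both interfaces; the route path, function names, and output formatting are assumptions, not part of this commit, and pipeline/app are assumed to be the objects built above.

# Hypothetical wiring sketch; not part of the committed api.py.
@app.get("/query")
async def run_query(query: str):
    # Expose the extractive QA pipeline through FastAPI
    return pipeline.run(query=query)

def predict_fn(query: str) -> str:
    # Return the top extracted answer as plain text for the Gradio demo
    result = pipeline.run(query=query)
    answers = result.get("answers", [])
    return answers[0].answer if answers else "No answer found"

gr.Interface(fn=predict_fn, inputs="textbox", outputs="text").launch()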
appsearch.py
ADDED
@@ -0,0 +1,106 @@
+import json
+from typing import List, Union, Dict
+from urllib.parse import urljoin
+import requests
+
+
+class AppSearchClient:
+    def __init__(self):
+        self.appsearch_endpoint = "https://fgm-v2.ent.eastus2.azure.elastic-cloud.com"
+        self.appsearch_private_key = "private-dzf1pbcssw97hxkm3wxbdrpu"
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.appsearch_private_key}",
+        }
+        assert self.appsearch_endpoint is not None
+        assert self.appsearch_private_key is not None
+
+    def list_all_engines(self) -> List[str]:
+        ENGINES_URL = "/api/as/v1/engines/"
+        request_url = urljoin(self.appsearch_endpoint, ENGINES_URL)
+        MAX_DOCS_PER_PAGE = 10
+        current_page = 1
+        while True:
+            params = (
+                ("page[size]", f"{MAX_DOCS_PER_PAGE}"),
+                ("page[current]", f"{current_page}"),
+            )
+            r = requests.get(request_url, headers=self.headers, params=params).json()
+            for item in r["results"]:
+                yield item["name"]
+            current_page += 1
+            if not len(r["results"]):
+                break
+
+    def create_engine(self, name) -> requests.Response:
+        ENGINES_URL = "/api/as/v1/engines/"
+        request_url = urljoin(self.appsearch_endpoint, ENGINES_URL)
+        data = json.dumps({"name": name}, indent=4, sort_keys=True)
+        r = requests.post(request_url, headers=self.headers, data=data)
+        return r
+
+    def index_documents(self, data: Union[Dict, List[Dict]], engine_name: str) -> None:
+        INDEX_URL = f"/api/as/v1/engines/{engine_name}/documents"
+        request_url = urljoin(self.appsearch_endpoint, INDEX_URL)
+        r = requests.post(
+            request_url,
+            headers=self.headers,
+            data=json.dumps(data, indent=4, sort_keys=True),
+        )
+
+    def list_existing_docs(self, engine_name) -> List[Dict]:
+        LIST_URL = f"/api/as/v1/engines/{engine_name}/documents/list"
+        MAX_DOCS_PER_PAGE = 100
+        request_url = urljoin(self.appsearch_endpoint, LIST_URL)
+        current_page = 1
+        docs = list()
+        while True:
+            params = (
+                ("page[size]", f"{MAX_DOCS_PER_PAGE}"),
+                ("page[current]", f"{current_page}"),
+            )
+            page_content = json.loads(
+                requests.get(request_url, headers=self.headers, params=params).text
+            )["results"]
+            docs.extend(page_content)
+            current_page += 1
+            if not page_content:
+                break
+        return docs
+
+    def list_existing_manual_urls(self, engine_name: str) -> List[Dict]:
+        for doc in self.list_existing_docs(engine_name):
+            if doc["is_manual"] == "true":
+                yield doc["id"]
+
+    def list_existing_non_manual_urls(self, engine_name: str) -> List[Dict]:
+        for doc in self.list_existing_docs(engine_name):
+            if doc["is_manual"] == "false":
+                yield doc["id"]
+
+    def list_existing_urls(self, engine_name: str) -> List[str]:
+        for doc in self.list_existing_docs(engine_name):
+            yield doc["id"]
+
+    def get_elastic_query(self, data: str, size: int):
+        return requests.post(
+            url=f"{self.appsearch_endpoint}/api/as/v0/engines/us-speeches-s/elasticsearch/_search?size={size}",
+            headers=self.headers, data=data)
+
+    def delete_existing_non_manual_docs(self, engine_name: str) -> None:
+        non_manual_doc_ids = list(self.list_existing_non_manual_urls(engine_name))
+        DELETE_URL = f"/api/as/v1/engines/{engine_name}/documents"
+        MAX_DOCS_TO_DELETE_PER_REQUEST = 100
+        request_url = urljoin(self.appsearch_endpoint, DELETE_URL)
+
+        def chunker(seq, size):
+            return (seq[pos: pos + size] for pos in range(0, len(seq), size))
+
+        for idx, group in enumerate(
+            chunker(non_manual_doc_ids, MAX_DOCS_TO_DELETE_PER_REQUEST)
+        ):
+            r = requests.delete(
+                request_url,
+                headers=self.headers,
+                data=json.dumps(group, indent=4, sort_keys=True),
+            )
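
For context, a minimal usage sketch of the AppSearchClient defined above. The engine name is taken from elsewhere in this commit and the document fields are purely illustrative; note that list_all_engines and the list_existing_*_urls methods are generators despite their List return annotations.

# Hypothetical usage sketch; engine name and document fields are illustrative.
client = AppSearchClient()

# Engines are yielded lazily, so materialize them with list()
print(list(client.list_all_engines()))

# Index a small batch of documents into an existing engine
client.index_documents(
    [{"id": "example-1", "text": "...", "is_manual": "false"}],
    engine_name="us-speeches",
)

# Page through everything currently stored in the engine
docs = client.list_existing_docs("us-speeches")
print(len(docs))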
config.py
ADDED
@@ -0,0 +1 @@
+UPDATE_SPEECHES = False
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+elastic-enterprise-search==8.4.0
+farm-haystack
+requests~=2.28.1
+fastapi~=0.86.0
+torch
+torchvision
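
Note: farm-haystack is left unpinned here, while the imports in api.py (haystack.document_stores.ElasticsearchDocumentStore, haystack.nodes.ElasticsearchRetriever, haystack.pipelines.ExtractiveQAPipeline) follow the Haystack 1.x package layout. Pinning to a 1.x release would likely make the install more reproducible; the exact version below is a suggestion, not part of this commit:

farm-haystack~=1.12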
utils.py
ADDED
@@ -0,0 +1,23 @@
+from typing import List, Dict
+
+from appsearch import AppSearchClient
+
+
+def get_us_speeches() -> List[Dict]:
+    appsearch = AppSearchClient()
+
+    us_speeches = appsearch.list_existing_docs("us-speeches")
+
+    for items in us_speeches:
+        if "_meta" in items:
+            del items["_meta"]
+
+    us_speeches_dict = [
+        {
+            'content': speech["text"],
+            'meta': {'filename': speech["filename"], 'speaker': speech["speaker"], 'date': speech["date"],
+                     'url': speech["url"]}
+        } for speech in us_speeches
+    ]
+
+    return us_speeches_dict
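
For reference, each element returned by get_us_speeches is shaped as a dict with content and meta keys, which is the dictionary form accepted by Haystack's document_store.write_documents in api.py. A sketch of one entry, with placeholder values:

# Illustrative shape of a single returned item; all values are placeholders.
{
    "content": "Full text of the speech ...",
    "meta": {
        "filename": "speech_001.txt",
        "speaker": "...",
        "date": "...",
        "url": "...",
    },
}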