cta2106 committed
Commit · 6431a8f
Parent(s): a532d6c
first commit

Files changed:
- .DS_Store +0 -0
- .idea/.gitignore +8 -0
- api.py +42 -0
- appsearch.py +106 -0
- config.py +1 -0
- requirements.txt +6 -0
- utils.py +23 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
api.py
ADDED
@@ -0,0 +1,42 @@
+from fastapi import FastAPI
+from utils import get_us_speeches
+from config import UPDATE_SPEECHES
+
+from haystack.document_stores import ElasticsearchDocumentStore
+from haystack.nodes import ElasticsearchRetriever
+from haystack.nodes import FARMReader
+from haystack.pipelines import ExtractiveQAPipeline
+
+import gradio as gr
+
+
+document_store = ElasticsearchDocumentStore(
+    host='fgm-v2.es.eastus2.azure.elastic-cloud.com',
+    username='elastic',
+    password='cxjWqZfmhcfhzpWmfX57ylJc',
+    scheme='https',
+    port=9243,
+    index='us-speeches'
+)
+
+if UPDATE_SPEECHES:
+    us_speeches = get_us_speeches()
+    document_store.write_documents(us_speeches)
+
+retriever = ElasticsearchRetriever(
+    document_store=document_store
+)
+
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
+
+pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
+
+app = FastAPI()
+
+
+async def run_query(query: str):
+    return pipeline.run(query=query)
+
+
+gr.Interface(predict_fn, "textbox", ["label", "label"]).launch()
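
Note: as committed, api.py passes predict_fn to gr.Interface even though no predict_fn is defined in the file, and run_query is never registered as a FastAPI route. Below is a minimal sketch of one possible way to wire the pipeline into both interfaces; the route path, function names, and output formatting are assumptions, not part of this commit, and pipeline/app are assumed to be the objects built above.

# Hypothetical wiring sketch; not part of the committed api.py.
@app.get("/query")
async def run_query(query: str):
    # Expose the extractive QA pipeline through FastAPI
    return pipeline.run(query=query)

def predict_fn(query: str) -> str:
    # Return the top extracted answer as plain text for the Gradio demo
    result = pipeline.run(query=query)
    answers = result.get("answers", [])
    return answers[0].answer if answers else "No answer found"

gr.Interface(fn=predict_fn, inputs="textbox", outputs="text").launch()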
appsearch.py
ADDED
@@ -0,0 +1,106 @@
+import json
+from typing import List, Union, Dict
+from urllib.parse import urljoin
+import requests
+
+
+class AppSearchClient:
+    def __init__(self):
+        self.appsearch_endpoint = "https://fgm-v2.ent.eastus2.azure.elastic-cloud.com"
+        self.appsearch_private_key = "private-dzf1pbcssw97hxkm3wxbdrpu"
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.appsearch_private_key}",
+        }
+        assert self.appsearch_endpoint is not None
+        assert self.appsearch_private_key is not None
+
+    def list_all_engines(self) -> List[str]:
+        ENGINES_URL = "/api/as/v1/engines/"
+        request_url = urljoin(self.appsearch_endpoint, ENGINES_URL)
+        MAX_DOCS_PER_PAGE = 10
+        current_page = 1
+        while True:
+            params = (
+                ("page[size]", f"{MAX_DOCS_PER_PAGE}"),
+                ("page[current]", f"{current_page}"),
+            )
+            r = requests.get(request_url, headers=self.headers, params=params).json()
+            for item in r["results"]:
+                yield item["name"]
+            current_page += 1
+            if not len(r["results"]):
+                break
+
+    def create_engine(self, name) -> requests.Response:
+        ENGINES_URL = "/api/as/v1/engines/"
+        request_url = urljoin(self.appsearch_endpoint, ENGINES_URL)
+        data = json.dumps({"name": name}, indent=4, sort_keys=True)
+        r = requests.post(request_url, headers=self.headers, data=data)
+        return r
+
+    def index_documents(self, data: Union[Dict, List[Dict]], engine_name: str) -> None:
+        INDEX_URL = f"/api/as/v1/engines/{engine_name}/documents"
+        request_url = urljoin(self.appsearch_endpoint, INDEX_URL)
+        r = requests.post(
+            request_url,
+            headers=self.headers,
+            data=json.dumps(data, indent=4, sort_keys=True),
+        )
+
+    def list_existing_docs(self, engine_name) -> List[Dict]:
+        LIST_URL = f"/api/as/v1/engines/{engine_name}/documents/list"
+        MAX_DOCS_PER_PAGE = 100
+        request_url = urljoin(self.appsearch_endpoint, LIST_URL)
+        current_page = 1
+        docs = list()
+        while True:
+            params = (
+                ("page[size]", f"{MAX_DOCS_PER_PAGE}"),
+                ("page[current]", f"{current_page}"),
+            )
+            page_content = json.loads(
+                requests.get(request_url, headers=self.headers, params=params).text
+            )["results"]
+            docs.extend(page_content)
+            current_page += 1
+            if not page_content:
+                break
+        return docs
+
+    def list_existing_manual_urls(self, engine_name: str) -> List[Dict]:
+        for doc in self.list_existing_docs(engine_name):
+            if doc["is_manual"] == "true":
+                yield doc["id"]
+
+    def list_existing_non_manual_urls(self, engine_name: str) -> List[Dict]:
+        for doc in self.list_existing_docs(engine_name):
+            if doc["is_manual"] == "false":
+                yield doc["id"]
+
+    def list_existing_urls(self, engine_name: str) -> List[str]:
+        for doc in self.list_existing_docs(engine_name):
+            yield doc["id"]
+
+    def get_elastic_query(self, data: str, size: int):
+        return requests.post(
+            url=f"{self.appsearch_endpoint}/api/as/v0/engines/us-speeches-s/elasticsearch/_search?size={size}",
+            headers=self.headers, data=data)
+
+    def delete_existing_non_manual_docs(self, engine_name: str) -> None:
+        non_manual_doc_ids = list(self.list_existing_non_manual_urls(engine_name))
+        DELETE_URL = f"/api/as/v1/engines/{engine_name}/documents"
+        MAX_DOCS_TO_DELETE_PER_REQUEST = 100
+        request_url = urljoin(self.appsearch_endpoint, DELETE_URL)
+
+        def chunker(seq, size):
+            return (seq[pos: pos + size] for pos in range(0, len(seq), size))
+
+        for idx, group in enumerate(
+            chunker(non_manual_doc_ids, MAX_DOCS_TO_DELETE_PER_REQUEST)
+        ):
+            r = requests.delete(
+                request_url,
+                headers=self.headers,
+                data=json.dumps(group, indent=4, sort_keys=True),
+            )
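
For context, a minimal usage sketch of the AppSearchClient defined above. The engine name is taken from elsewhere in this commit and the document fields are purely illustrative; note that list_all_engines and the list_existing_*_urls methods are generators despite their List return annotations.

# Hypothetical usage sketch; engine name and document fields are illustrative.
client = AppSearchClient()

# Engines are yielded lazily, so materialize them with list()
print(list(client.list_all_engines()))

# Index a small batch of documents into an existing engine
client.index_documents(
    [{"id": "example-1", "text": "...", "is_manual": "false"}],
    engine_name="us-speeches",
)

# Page through everything currently stored in the engine
docs = client.list_existing_docs("us-speeches")
print(len(docs))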
config.py
ADDED
@@ -0,0 +1 @@
+UPDATE_SPEECHES = False
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+elastic-enterprise-search==8.4.0
+farm-haystack
+requests~=2.28.1
+fastapi~=0.86.0
+torch
+torchvision
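
Note: farm-haystack is left unpinned here, while the imports in api.py (haystack.document_stores.ElasticsearchDocumentStore, haystack.nodes.ElasticsearchRetriever, haystack.pipelines.ExtractiveQAPipeline) follow the Haystack 1.x package layout. Pinning to a 1.x release would likely make the install more reproducible; the exact version below is a suggestion, not part of this commit:

farm-haystack~=1.12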
utils.py
ADDED
@@ -0,0 +1,23 @@
+from typing import List, Dict
+
+from appsearch import AppSearchClient
+
+
+def get_us_speeches() -> List[Dict]:
+    appsearch = AppSearchClient()
+
+    us_speeches = appsearch.list_existing_docs("us-speeches")
+
+    for items in us_speeches:
+        if "_meta" in items:
+            del items["_meta"]
+
+    us_speeches_dict = [
+        {
+            'content': speech["text"],
+            'meta': {'filename': speech["filename"], 'speaker': speech["speaker"], 'date': speech["date"],
+                     'url': speech["url"]}
+        } for speech in us_speeches
+    ]
+
+    return us_speeches_dict
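
For reference, each element returned by get_us_speeches is shaped as a dict with content and meta keys, which is the dictionary form accepted by Haystack's document_store.write_documents in api.py. A sketch of one entry, with placeholder values:

# Illustrative shape of a single returned item; all values are placeholders.
{
    "content": "Full text of the speech ...",
    "meta": {
        "filename": "speech_001.txt",
        "speaker": "...",
        "date": "...",
        "url": "...",
    },
}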