Add handling for already-indexed documents
Browse files
app.py
CHANGED
@@ -1,16 +1,14 @@
|
|
1 |
-
import requests
|
2 |
-
|
3 |
-
import zipfile
|
4 |
from io import BytesIO
|
5 |
-
import
|
6 |
-
import
|
7 |
-
import
|
8 |
-
import warnings
|
9 |
from fastapi import FastAPI, HTTPException
|
10 |
from fastapi.middleware.cors import CORSMiddleware
|
11 |
from pydantic import BaseModel
|
12 |
|
13 |
-
|
14 |
|
15 |
app = FastAPI(title="3GPP Specification Splitter API",
|
16 |
description="API to split and display specifications by their chapters & sub-chapters",
|
@@ -28,12 +26,36 @@ app.add_middleware(
|
|
28 |
allow_headers=["*"],
|
29 |
)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
class SpecRequest(BaseModel):
|
32 |
spec_id: str
|
33 |
|
34 |
@app.post("/get_full_text")
|
35 |
def get_text(request: SpecRequest):
|
36 |
specification = request.spec_id
|
|
|
|
|
|
|
37 |
total_file = []
|
38 |
url = requests.post(
|
39 |
"https://organizedprogrammers-3gppdocfinder.hf.space/find",
|
@@ -92,6 +114,8 @@ def get_text(request: SpecRequest):
|
|
92 |
|
93 |
@app.post("/get_spec_content")
|
94 |
def get_spec_content(request: SpecRequest):
|
|
|
|
|
95 |
text = get_text(request)
|
96 |
chapters = []
|
97 |
chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")
|
|
|
1 |
+
import requests, os, zipfile, subprocess, re, warnings
|
2 |
+
warnings.filterwarnings("ignore")
|
|
|
3 |
from io import BytesIO
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from datasets import load_dataset
|
6 |
+
from huggingface_hub import login
|
|
|
7 |
from fastapi import FastAPI, HTTPException
|
8 |
from fastapi.middleware.cors import CORSMiddleware
|
9 |
from pydantic import BaseModel
|
10 |
|
11 |
+
load_dotenv()
|
12 |
|
13 |
app = FastAPI(title="3GPP Specification Splitter API",
|
14 |
description="API to split and display specifications by their chapters & sub-chapters",
|
|
|
26 |
allow_headers=["*"],
|
27 |
)
|
28 |
|
29 |
+
# Fetch the pre-indexed specification dataset once at import time and keep its
# "train" split as a plain list of row dicts, so the lookup helpers can scan it
# without touching the dataset object again. Requires HF_TOKEN in the environment.
spec_contents = load_dataset(
    "OrganizedProgrammers/3GPPSpecContent",
    token=os.environ["HF_TOKEN"],
)["train"].to_list()
|
31 |
+
|
32 |
+
def is_doc_indexed(spec_id: str) -> bool:
    """Return True if at least one entry of ``spec_contents`` belongs to *spec_id*.

    Each entry of the module-level ``spec_contents`` list is compared on its
    ``"doc_id"`` key. An empty ``spec_contents`` yields ``False``.
    """
    # A generator expression lets any() stop at the first match instead of
    # materialising a full list of booleans before looking at any of them.
    return any(spec_id == entry["doc_id"] for entry in spec_contents)
|
34 |
+
|
35 |
+
def get_full_doc(spec_id: str):
    """Return the full text of document *spec_id* as a single string.

    Every entry of ``spec_contents`` whose ``"doc_id"`` equals *spec_id* is
    rendered as its section value, a newline, then its content; the rendered
    entries are joined with a blank line, in the order they appear in
    ``spec_contents``. If no entry matches, the result is an empty string.
    """
    matching = (spec for spec in spec_contents if spec["doc_id"] == spec_id)
    return "\n\n".join(
        f"{spec['section']}\n{spec['content']}" for spec in matching
    )
|
41 |
+
|
42 |
+
def get_structured_doc(spec_id: str):
    """Return document *spec_id* as a dict mapping each section to its content.

    Only entries of ``spec_contents`` whose ``"doc_id"`` equals *spec_id* are
    included. If the same section value occurs more than once, the content of
    the last occurrence is kept. An empty dict is returned when nothing matches.
    """
    return {
        entry["section"]: entry["content"]
        for entry in spec_contents
        if entry["doc_id"] == spec_id
    }
|
48 |
+
|
49 |
+
|
50 |
# Request body model accepted by the /get_full_text and /get_spec_content endpoints.
class SpecRequest(BaseModel):
|
51 |
# Identifier of the requested document; it is matched against the "doc_id"
# of the indexed entries before falling back to a remote lookup.
spec_id: str
|
52 |
|
53 |
@app.post("/get_full_text")
|
54 |
def get_text(request: SpecRequest):
|
55 |
specification = request.spec_id
|
56 |
+
if is_doc_indexed(specification):
|
57 |
+
return get_full_doc(specification)
|
58 |
+
print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")
|
59 |
total_file = []
|
60 |
url = requests.post(
|
61 |
"https://organizedprogrammers-3gppdocfinder.hf.space/find",
|
|
|
114 |
|
115 |
@app.post("/get_spec_content")
|
116 |
def get_spec_content(request: SpecRequest):
|
117 |
+
if is_doc_indexed(request.spec_id):
|
118 |
+
return get_structured_doc(request.spec_id)
|
119 |
text = get_text(request)
|
120 |
chapters = []
|
121 |
chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")
|