Spaces:

OrganizedProgrammers
/

3GPPSpecSplitter

Running

App Files Files Community

om4r932 committed 2 days ago

Commit

7f4e86d

1 Parent(s): a5c3314

Add already indexed doc handling

Browse files

Files changed (1) hide show

app.py +32 -8

app.py CHANGED Viewed

@@ -1,16 +1,14 @@
-import requests
-import os
-import zipfile
 from io import BytesIO
-import subprocess
-import os
-import re
-import warnings
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-warnings.filterwarnings("ignore")
 app = FastAPI(title="3GPP Specification Splitter API",
               description="API to split and display specifications by their chapters & sub-chapters",
@@ -28,12 +26,36 @@ app.add_middleware(
     allow_headers=["*"],
 )
 class SpecRequest(BaseModel):
     """Request body accepted by the POST endpoints of this API."""
     spec_id: str  # identifier of the requested document
 @app.post("/get_full_text")
 def get_text(request: SpecRequest):
     specification = request.spec_id
     total_file = []
     url = requests.post(
         "https://organizedprogrammers-3gppdocfinder.hf.space/find",
@@ -92,6 +114,8 @@ def get_text(request: SpecRequest):
 @app.post("/get_spec_content")
 def get_spec_content(request: SpecRequest):
     text = get_text(request)
     chapters = []
     chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")

+import requests, os, zipfile, subprocess, re, warnings
+warnings.filterwarnings("ignore")
 from io import BytesIO
+from dotenv import load_dotenv
+from datasets import load_dataset
+from huggingface_hub import login
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+load_dotenv()
 app = FastAPI(title="3GPP Specification Splitter API",
               description="API to split and display specifications by their chapters & sub-chapters",
     allow_headers=["*"],
 )
+# Load the indexed specification dataset once, when the module is imported.
+# os.environ["HF_TOKEN"] raises KeyError if the variable is not set.
+spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
+# Keep only the "train" split as a plain list of rows; the lookup helpers
+# scan this list and read each row's "doc_id", "section" and "content".
+spec_contents = spec_contents["train"].to_list()
+def is_doc_indexed(spec_id: str):
+    return any([True if spec_id == s["doc_id"] else False for s in spec_contents])
+def get_full_doc(spec_id: str):
+    doc = []
+    for spec in spec_contents:
+        if spec["doc_id"] == spec_id:
+            doc.append(f"{spec['section']}\n{spec['content']}")
+    return "\n\n".join(doc)
+def get_structured_doc(spec_id: str):
+    doc = {}
+    for spec in spec_contents:
+        if spec["doc_id"] == spec_id:
+            doc[spec["section"]] = spec["content"]
+    return doc
 class SpecRequest(BaseModel):
     """Request body accepted by the POST endpoints of this API."""
     spec_id: str  # identifier of the requested document
 @app.post("/get_full_text")
 def get_text(request: SpecRequest):
     specification = request.spec_id
+    if is_doc_indexed(specification):
+        return get_full_doc(specification)
+    print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")
     total_file = []
     url = requests.post(
         "https://organizedprogrammers-3gppdocfinder.hf.space/find",
 @app.post("/get_spec_content")
 def get_spec_content(request: SpecRequest):
+    if is_doc_indexed(request.spec_id):
+        return get_structured_doc(request.spec_id)
     text = get_text(request)
     chapters = []
     chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")