File size: 4,656 Bytes
7f4e86d a5c3314 7f4e86d 7adc29d 7f4e86d 7adc29d a5c3314 7adc29d 7f4e86d a5c3314 7f4e86d a5c3314 a7f45db a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7adc29d a5c3314 7f4e86d a5c3314 f0a2998 a5c3314 f0a2998 a5c3314 f0a2998 a5c3314 7adc29d a5c3314 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import requests, os, zipfile, subprocess, re, warnings
warnings.filterwarnings("ignore")
from io import BytesIO
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
# Load variables from a local .env file into the process environment
# (presumably where HF_TOKEN, read below, comes from -- confirm for deployment).
load_dotenv()

# docs_url="/" serves the interactive API documentation on the root path.
app = FastAPI(
    title="3GPP Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
    docs_url="/",
)

# NOTE(review): every origin is allowed while credentials are enabled as well.
# The FastAPI CORS documentation advises against combining a wildcard origin
# with allow_credentials=True -- confirm whether credentials are needed at all.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# The whole "train" split is materialised once, at import time, as a list of
# rows; the helpers below read the "doc_id", "section" and "content" keys of
# each row. A missing HF_TOKEN environment variable raises KeyError here.
spec_contents = load_dataset(
    "OrganizedProgrammers/3GPPSpecContent",
    token=os.environ["HF_TOKEN"],
)["train"].to_list()
def is_doc_indexed(spec_id: str):
    """Tell whether the in-memory dataset holds at least one row for *spec_id*.

    Args:
        spec_id: Document identifier compared against each row's "doc_id".

    Returns:
        True if any row of the module-level ``spec_contents`` list has a
        matching "doc_id", otherwise False (including when the list is empty).
    """
    # A generator lets any() stop at the first match instead of first building
    # a full list of booleans for every row.
    return any(row["doc_id"] == spec_id for row in spec_contents)
def get_full_doc(spec_id: str):
    """Return the whole text of *spec_id* as one string.

    Every row of the module-level ``spec_contents`` whose "doc_id" equals
    *spec_id* contributes a "<section>\\n<content>" chunk; chunks keep dataset
    order and are separated by a blank line. An unknown id yields "".
    """
    matching_chunks = (
        f"{row['section']}\n{row['content']}"
        for row in spec_contents
        if row["doc_id"] == spec_id
    )
    return "\n\n".join(matching_chunks)
def get_structured_doc(spec_id: str):
    """Return the sections of *spec_id* as a ``{section: content}`` mapping.

    Rows are taken from the module-level ``spec_contents`` in dataset order.
    When the same section title occurs more than once, the last content wins.
    An unknown id yields an empty dict.
    """
    return {
        row["section"]: row["content"]
        for row in spec_contents
        if row["doc_id"] == spec_id
    }
# Request body shared by both POST endpoints in this file.
# (Documented with comments rather than a docstring so the generated
# schema description is left unchanged.)
class SpecRequest(BaseModel):
    # Identifier of the document to fetch; it is compared against the
    # dataset's "doc_id" values and sent as "doc_id" to the doc-finder service.
    spec_id: str
# (connect, read) timeouts in seconds for outbound HTTP calls, so that a
# stalled remote server cannot block a worker indefinitely.
_HTTP_TIMEOUT = (10, 120)


def _fetch_spec_archive(spec_id: str) -> bytes:
    """Resolve *spec_id* via the doc-finder service and download its archive bytes."""
    # NOTE(review): verify=False disables TLS certificate verification on both
    # calls. It is kept to preserve existing behaviour, but should be removed
    # if the remote certificates are valid.
    lookup = requests.post(
        "https://organizedprogrammers-3gppdocfinder.hf.space/find",
        verify=False,
        headers={"Content-Type": "application/json"},
        json={"doc_id": spec_id},
        timeout=_HTTP_TIMEOUT,
    )
    if lookup.status_code != 200:
        raise HTTPException(404, detail="Not found")
    try:
        archive_url = lookup.json()["url"]
    except (ValueError, KeyError, TypeError) as err:
        # Malformed or unexpected lookup answer: treat it like an unknown document.
        raise HTTPException(404, detail="Not found") from err

    download = requests.get(
        archive_url,
        verify=False,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"},
        timeout=_HTTP_TIMEOUT,
    )
    if download.status_code != 200:
        # Without this check an error page would be handed to zipfile and
        # surface as an unhandled BadZipFile.
        raise HTTPException(404, detail="Not found")
    return download.content


def _open_spec_archive(payload: bytes) -> zipfile.ZipFile:
    """Open *payload* as a zip, descending one level if it only wraps another zip."""
    try:
        outer = zipfile.ZipFile(BytesIO(payload))
        names = outer.namelist()
        if len(names) == 1 and names[0].endswith(".zip"):
            with outer:
                inner_payload = outer.read(names[0])
            return zipfile.ZipFile(BytesIO(inner_payload))
        return outer
    except zipfile.BadZipFile as err:
        raise HTTPException(status_code=404, detail="Not found !") from err


def _convert_doc_to_lines(doc_bytes: bytes, ext: str) -> list:
    """Convert one Word document to text with LibreOffice; return its non-blank, stripped lines."""
    import tempfile

    # A private temporary directory keeps user-supplied identifiers out of
    # file paths, avoids clashes between concurrent requests, and is removed
    # even when the conversion fails.
    with tempfile.TemporaryDirectory() as workdir:
        input_path = os.path.join(workdir, f"spec.{ext}")
        output_path = os.path.join(workdir, "spec.txt")
        with open(input_path, "wb") as f:
            f.write(doc_bytes)
        subprocess.run([
            "libreoffice",
            "--headless",
            "--convert-to", "txt",
            "--outdir", workdir,
            input_path,
        ], check=True)
        with open(output_path, "r") as f:
            return [line.strip() for line in f if line.strip()]


@app.post("/get_full_text")
def get_text(request: SpecRequest):
    """Return the full text of the requested document.

    If the document is present in the preloaded dataset, its text is returned
    as a single string. Otherwise the document archive is located through the
    doc-finder service, downloaded, and every ``.doc``/``.docx`` entry whose
    name contains neither "cover" nor "annex" is converted to text with
    LibreOffice.

    Args:
        request: Body carrying the ``spec_id`` of the document.

    Returns:
        A ``str`` for indexed documents; otherwise a list of the non-blank,
        whitespace-stripped lines of all converted documents, in archive order.

    Raises:
        HTTPException: 404 when the document cannot be located or downloaded,
            the download is not a valid zip archive, or no text was extracted.
        subprocess.CalledProcessError: If the LibreOffice conversion fails.
        requests.RequestException: On network failures or timeouts.
    """
    specification = request.spec_id
    if is_doc_indexed(specification):
        return get_full_doc(specification)
    print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")

    payload = _fetch_spec_archive(specification)
    total_file = []
    with _open_spec_archive(payload) as archive:
        for file_info in archive.infolist():
            filename = file_info.filename
            lowered = filename.lower()
            # Compare case-insensitively so entries such as "X.DOC" are not skipped.
            if not lowered.endswith((".doc", ".docx")):
                continue
            if "cover" in lowered or "annex" in lowered:
                continue
            ext = lowered.rsplit(".", 1)[-1]
            total_file.extend(_convert_doc_to_lines(archive.read(filename), ext))

    if not total_file:
        raise HTTPException(status_code=404, detail="Not found !")
    return total_file
# A heading line is a clause number ("5", "5a", "5.2.1", ...), a TAB, then a
# title starting with an upper-case letter or digit and containing no further
# tabs. Compiled once at import time instead of on every request.
_CHAPTER_HEADING_RE = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")


def _split_into_chapters(lines):
    """Map each heading in *lines* (tab replaced by a space) to the lines that follow it."""
    heading_positions = [
        index for index, line in enumerate(lines)
        if _CHAPTER_HEADING_RE.fullmatch(line)
    ]
    # Each chapter ends where the next one starts; the last runs to the end.
    chapter_ends = heading_positions[1:] + [len(lines)]
    return {
        lines[start].replace('\t', " "): "\n".join(lines[start + 1:end])
        for start, end in zip(heading_positions, chapter_ends)
    }


@app.post("/get_spec_content")
def get_spec_content(request: SpecRequest):
    """Return the requested document split by chapter and sub-chapter.

    Indexed documents are answered straight from the preloaded dataset.
    Otherwise the text lines produced by ``get_text`` are scanned for heading
    lines and grouped under the heading that precedes them.

    Args:
        request: Body carrying the ``spec_id`` of the document.

    Returns:
        A dict mapping each heading to the newline-joined lines of its body.
        Lines before the first heading are not included, and a heading that
        occurs more than once keeps only its last body.

    Raises:
        HTTPException: Propagated from ``get_text`` when the document cannot
            be retrieved.
    """
    if is_doc_indexed(request.spec_id):
        return get_structured_doc(request.spec_id)
    text = get_text(request)
    return _split_into_chapters(text)