import os
import re
import subprocess
import warnings
import zipfile
from io import BytesIO

import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# Suppress warnings (e.g. urllib3's InsecureRequestWarning triggered by the
# verify=False requests below).
warnings.filterwarnings("ignore")
app = FastAPI(
    title="3GPP Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
    docs_url="/",
)

origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class SpecRequest(BaseModel):
    spec_id: str
@app.post("/get_full_text")
def get_text(request: SpecRequest):
specification = request.spec_id
total_file = []
url = requests.post(
"https://organizedprogrammers-3gppdocfinder.hf.space/find",
verify=False,
headers={"Content-Type": "application/json"},
json={"doc_id": specification}
)
if url.status_code != 200:
raise HTTPException(404, detail="Not found")
url = url.json()['url']
response = requests.get(
url,
verify=False,
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
)
zip_bytes = BytesIO(response.content)
current_zip_file = zipfile.ZipFile(zip_bytes)
for file_info in current_zip_file.infolist():
if file_info.filename.endswith(".zip") and len(current_zip_file.namelist()) == 1:
nested_zip_bytes = BytesIO(current_zip_file.read(file_info.filename))
current_zip_file = zipfile.ZipFile(nested_zip_bytes)
break
for file_info in current_zip_file.infolist():
filename = file_info.filename
if (filename.endswith('.doc') or filename.endswith('.docx')) and ("cover" not in filename.lower() and "annex" not in filename.lower()):
doc_bytes = current_zip_file.read(filename)
ext = filename.split(".")[-1]
input_path = f"/tmp/{specification}.{ext}"
output_path = f"/tmp/{specification}.txt"
with open(input_path, "wb") as f:
f.write(doc_bytes)
subprocess.run([
"libreoffice",
"--headless",
"--convert-to", "txt",
"--outdir", "/tmp",
input_path
], check=True)
with open(output_path, "r") as f:
txt_data = [line.strip() for line in f if line.strip()]
os.remove(input_path)
os.remove(output_path)
total_file.extend(txt_data)
if total_file == []:
raise HTTPException(status_code=404, detail="Not found !")
else:
return total_file
@app.post("/get_spec_content")
def get_spec_content(request: SpecRequest):
text = get_text(request)
chapters = []
chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")
for i, line in enumerate(text):
if chapter_regex.fullmatch(line):
chapters.append((i, line))
document = {}
for i in range(len(chapters)):
start_index, chapter_title = chapters[i]
end_index = chapters[i+1][0] if i+1 < len(chapters) else len(text)
content_lines = text[start_index + 1 : end_index]
document[chapter_title.replace('\t', " ")] = "\n".join(content_lines)
return document |