"""FastAPI service that downloads a 3GPP specification and splits it by chapter.

Two POST endpoints are exposed:

* ``/get_full_text``    -- the non-empty, stripped text lines of the specification.
* ``/get_spec_content`` -- a mapping of chapter heading to chapter body text.

The download URL of a specification is looked up through an external
"doc finder" HTTP service, the resulting zip archive is unpacked in memory and
every Word document in it is converted to plain text with headless LibreOffice.
"""
import os
import re
import subprocess
import tempfile
import warnings
import zipfile
from io import BytesIO
from typing import Dict, List

import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# Silences every warning process-wide, including the InsecureRequestWarning
# that the ``verify=False`` requests below would otherwise emit.
warnings.filterwarnings("ignore")

app = FastAPI(
    title="3GPP Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
    docs_url="/",
)

origins = [
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Service that maps a specification identifier to the URL of its zip archive.
FINDER_URL = "https://organizedprogrammers-3gppdocfinder.hf.space/find"

# Browser-like User-Agent sent with the archive download request.
DOWNLOAD_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/138.0.0.0 Safari/537.36"
)

# (connect, read) timeouts in seconds so a stalled upstream cannot hang a worker.
HTTP_TIMEOUT = (10, 120)

# Upper bound, in seconds, for a single LibreOffice conversion.
CONVERSION_TIMEOUT = 600

# A chapter heading is "<number>\t<Title>": a dotted section number (optionally
# with one trailing letter on the first component), a tab, then a title that
# starts with an upper-case letter or digit and contains no further tab.
CHAPTER_REGEX = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")


class SpecRequest(BaseModel):
    """Request body shared by both endpoints."""

    # Identifier of the specification, forwarded to the finder as ``doc_id``.
    spec_id: str


def _resolve_download_url(spec_id: str) -> str:
    """Ask the finder service for the archive URL of *spec_id*.

    Raises:
        HTTPException: 502 if the finder cannot be reached; 404 if it answers
            with a non-200 status or without a usable ``url`` field.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # as in the original code, but worth revisiting.
        lookup = requests.post(
            FINDER_URL,
            verify=False,
            headers={"Content-Type": "application/json"},
            json={"doc_id": spec_id},
            timeout=HTTP_TIMEOUT,
        )
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail="Document finder unreachable") from exc

    if lookup.status_code != 200:
        raise HTTPException(404, detail="Not found")

    try:
        url = lookup.json()["url"]
    except (ValueError, KeyError, TypeError) as exc:
        # Body was not JSON, or not an object carrying a "url" key.
        raise HTTPException(404, detail="Not found") from exc
    if not isinstance(url, str) or not url:
        raise HTTPException(404, detail="Not found")
    return url


def _open_spec_archive(url: str) -> zipfile.ZipFile:
    """Download *url* and return it as an in-memory ``ZipFile``.

    When the archive holds exactly one member and that member is itself a
    ``.zip``, the nested archive is returned instead.

    Raises:
        HTTPException: 502 if the download fails or the payload is not a zip.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # as in the original code, but worth revisiting.
        response = requests.get(
            url,
            verify=False,
            headers={"User-Agent": DOWNLOAD_USER_AGENT},
            timeout=HTTP_TIMEOUT,
        )
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail="Specification download failed") from exc
    if not response.ok:
        raise HTTPException(status_code=502, detail="Specification download failed")

    try:
        archive = zipfile.ZipFile(BytesIO(response.content))
        members = archive.namelist()
        if len(members) == 1 and members[0].endswith(".zip"):
            archive = zipfile.ZipFile(BytesIO(archive.read(members[0])))
    except zipfile.BadZipFile as exc:
        raise HTTPException(
            status_code=502, detail="Downloaded specification is not a valid zip archive"
        ) from exc
    return archive


def _convert_to_lines(doc_bytes: bytes, extension: str, workdir: str, stem: str) -> List[str]:
    """Convert one Word document to text and return its non-empty stripped lines.

    The document is written to ``<workdir>/<stem>.<extension>`` and converted
    by headless LibreOffice, which writes ``<workdir>/<stem>.txt``. File names
    are chosen by the caller, never taken from client input or archive member
    names, so neither can influence where files are written.

    Raises:
        HTTPException: 500 if LibreOffice fails, times out, is not installed,
            or produces no output file.
    """
    input_path = os.path.join(workdir, f"{stem}.{extension}")
    output_path = os.path.join(workdir, f"{stem}.txt")

    with open(input_path, "wb") as source:
        source.write(doc_bytes)

    try:
        subprocess.run(
            [
                "libreoffice",
                "--headless",
                # Force UTF-8 output so decoding does not depend on the locale.
                "--convert-to",
                "txt:Text (encoded):UTF8",
                "--outdir",
                workdir,
                input_path,
            ],
            check=True,
            timeout=CONVERSION_TIMEOUT,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as exc:
        raise HTTPException(status_code=500, detail="Document conversion failed") from exc

    try:
        # utf-8-sig transparently drops a leading BOM if the exporter wrote one.
        with open(output_path, "r", encoding="utf-8-sig", errors="replace") as converted:
            stripped = (line.strip() for line in converted)
            return [line for line in stripped if line]
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=500, detail="Document conversion produced no output"
        ) from exc


def _load_spec_lines(spec_id: str) -> List[str]:
    """Return the text lines of every relevant document of specification *spec_id*.

    ``.doc`` / ``.docx`` members (matched case-insensitively) whose name does
    not contain "cover" or "annex" are converted in archive order and their
    lines concatenated.

    Raises:
        HTTPException: 404 if the specification is unknown or yields no text;
            502 / 500 for upstream or conversion failures (see the helpers).
    """
    url = _resolve_download_url(spec_id)
    lines: List[str] = []

    # The temporary directory is private to this request and removed on exit,
    # even when a conversion fails, so nothing leaks and requests cannot clash.
    with _open_spec_archive(url) as archive, tempfile.TemporaryDirectory() as workdir:
        for index, member in enumerate(archive.infolist()):
            lowered = member.filename.lower()
            if not lowered.endswith((".doc", ".docx")):
                continue
            if "cover" in lowered or "annex" in lowered:
                continue

            extension = "docx" if lowered.endswith(".docx") else "doc"
            try:
                doc_bytes = archive.read(member)
            except zipfile.BadZipFile as exc:
                raise HTTPException(
                    status_code=502, detail="Downloaded specification archive is corrupted"
                ) from exc
            lines.extend(_convert_to_lines(doc_bytes, extension, workdir, f"doc_{index}"))

    if not lines:
        raise HTTPException(status_code=404, detail="Not found !")
    return lines


def _split_chapters(lines: List[str]) -> Dict[str, str]:
    """Group *lines* by chapter heading.

    Every line fully matching ``CHAPTER_REGEX`` starts a chapter; its body is
    all following lines up to the next heading (or the end), joined with
    newlines. Keys are the heading lines with tabs replaced by spaces. Lines
    before the first heading are dropped, and a heading that occurs more than
    once keeps only its last body.
    """
    headings = [
        (position, line)
        for position, line in enumerate(lines)
        if CHAPTER_REGEX.fullmatch(line)
    ]
    # Each chapter ends where the next one starts; the last runs to the end.
    ends = [position for position, _ in headings[1:]] + [len(lines)]

    document: Dict[str, str] = {}
    for (start, title), end in zip(headings, ends):
        document[title.replace("\t", " ")] = "\n".join(lines[start + 1:end])
    return document


@app.post("/get_full_text")
def get_text(request: SpecRequest):
    """Return the specification's text as a list of non-empty, stripped lines.

    Raises:
        HTTPException: 404 if the specification cannot be found or contains no
            convertible document; 502 / 500 on upstream or conversion errors.
    """
    return _load_spec_lines(request.spec_id)


@app.post("/get_spec_content")
def get_spec_content(request: SpecRequest):
    """Return the specification as a ``{chapter heading: chapter text}`` mapping.

    Raises:
        HTTPException: same conditions as ``get_text``.
    """
    return _split_chapters(get_text(request))