Spaces:

OrganizedProgrammers
/

3GPPSpecSplitter

Running

App Files Files Community

3GPPSpecSplitter / app.py

om4r932

Update methods

a5c3314 10 days ago

raw

history blame

3.63 kB

	import os
	import re
	import subprocess
	import tempfile
	import warnings
	import zipfile
	from io import BytesIO

	import requests
	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel

	# Suppress every warning category for the whole process. This also hides the
	# warnings that requests/urllib3 emit for the verify=False calls made below.
	warnings.filterwarnings("ignore")

	# docs_url="/" places the generated interactive API documentation at the
	# site root instead of the framework default.
	app = FastAPI(
	    title="3GPP Specification Splitter API",
	    description="API to split and display specifications by their chapters & sub-chapters",
	    docs_url="/",
	)

	# Origins allowed to call this API from a browser; "*" means any origin.
	origins = ["*"]

	# NOTE(review): wildcard origins are combined with allow_credentials=True.
	# Confirm against the CORS middleware documentation that this combination
	# is intended; the endpoints in this file do not read cookies or auth headers.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=origins,
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# JSON request body accepted by both POST endpoints in this file.
	class SpecRequest(BaseModel):
	    # Identifier of the specification to process; get_text forwards it as
	    # "doc_id" to the document-finder service.
	    spec_id: str

	def _convert_doc_to_lines(doc_bytes, ext):
	    """Convert one Word document to its non-empty, stripped text lines.

	    The document is written to a private temporary directory, converted to
	    plain text with headless LibreOffice, and read back. The directory is
	    removed on exit, including when the conversion fails.

	    Args:
	        doc_bytes: Raw bytes of the .doc/.docx file.
	        ext: File extension without the dot (e.g. "doc" or "docx").

	    Returns:
	        A list of stripped lines, with blank lines dropped.

	    Raises:
	        subprocess.CalledProcessError: If LibreOffice exits with an error.
	    """
	    # A fresh directory per document keeps user-supplied identifiers out of
	    # file paths and prevents concurrent requests from overwriting or
	    # deleting each other's files.
	    with tempfile.TemporaryDirectory() as workdir:
	        input_path = os.path.join(workdir, f"document.{ext}")
	        # LibreOffice writes "<input base name>.txt" into --outdir.
	        output_path = os.path.join(workdir, "document.txt")
	        with open(input_path, "wb") as f:
	            f.write(doc_bytes)

	        subprocess.run([
	            "libreoffice",
	            "--headless",
	            "--convert-to", "txt",
	            "--outdir", workdir,
	            input_path,
	        ], check=True)

	        with open(output_path, "r") as f:
	            return [line.strip() for line in f if line.strip()]


	@app.post("/get_full_text")
	def get_text(request: SpecRequest):
	    """Return the full text of a specification as a list of non-empty lines.

	    The specification's archive URL is looked up through the document-finder
	    service, the archive is downloaded, and every .doc/.docx file in it
	    (except files whose name contains "cover" or "annex") is converted to
	    plain text. The lines of all converted files are concatenated in archive
	    order.

	    Raises:
	        HTTPException: 404 if the specification cannot be located or
	            downloaded, if the download is not a valid zip archive, or if no
	            text could be extracted.
	    """
	    specification = request.spec_id

	    # NOTE(review): verify=False disables TLS certificate verification for
	    # both outgoing requests; kept as-is, confirm it is still required.
	    finder_response = requests.post(
	        "https://organizedprogrammers-3gppdocfinder.hf.space/find",
	        verify=False,
	        headers={"Content-Type": "application/json"},
	        json={"doc_id": specification},
	    )
	    if finder_response.status_code != 200:
	        raise HTTPException(404, detail="Not found")

	    archive_url = finder_response.json()['url']
	    archive_response = requests.get(
	        archive_url,
	        verify=False,
	        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"},
	    )
	    # A failed download would otherwise be handed to zipfile as if it were
	    # an archive and surface as an unhandled error.
	    if archive_response.status_code != 200:
	        raise HTTPException(404, detail="Not found")

	    try:
	        archive = zipfile.ZipFile(BytesIO(archive_response.content))
	        names = archive.namelist()
	        # Some archives contain nothing but one inner zip; unwrap one level.
	        if len(names) == 1 and names[0].endswith(".zip"):
	            archive = zipfile.ZipFile(BytesIO(archive.read(names[0])))
	    except zipfile.BadZipFile as err:
	        raise HTTPException(404, detail="Not found") from err

	    total_file = []
	    for filename in archive.namelist():
	        lowered = filename.lower()
	        # Match extensions case-insensitively, like the cover/annex filter.
	        if not lowered.endswith((".doc", ".docx")):
	            continue
	        if "cover" in lowered or "annex" in lowered:
	            continue
	        ext = lowered.rsplit(".", 1)[-1]
	        total_file.extend(_convert_doc_to_lines(archive.read(filename), ext))

	    if not total_file:
	        raise HTTPException(status_code=404, detail="Not found !")
	    return total_file

	# Splits a specification's text into chapters.
	#
	# The full text comes from get_text(request) as a list of lines. A line is a
	# chapter heading when it is a section number (digits, an optional lowercase
	# letter, optional ".digits" groups), a tab, and a title starting with an
	# uppercase letter or digit. The result maps each heading (tab replaced by a
	# space) to the newline-joined lines between it and the next heading, or the
	# end of the text. Lines before the first heading are not included, and a
	# repeated heading keeps only its last occurrence.
	@app.post("/get_spec_content")
	def get_spec_content(request: SpecRequest):
	    lines = get_text(request)
	    heading_pattern = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")

	    heading_positions = [
	        position
	        for position, line in enumerate(lines)
	        if heading_pattern.fullmatch(line)
	    ]
	    # Each section ends where the next heading starts; the last one runs to
	    # the end of the text.
	    section_ends = heading_positions[1:] + [len(lines)]

	    return {
	        lines[begin].replace("\t", " "): "\n".join(lines[begin + 1:end])
	        for begin, end in zip(heading_positions, section_ends)
	    }