import requests, os, zipfile, subprocess, re, warnings
warnings.filterwarnings("ignore")
from io import BytesIO
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

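# Read environment variables (e.g. HF_TOKEN) from a local .env file when present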
load_dotenv()

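# FastAPI app; the interactive docs are served at the root path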
app = FastAPI(title="3GPP Specification Splitter API", 
              description="API to split and display specifications by their chapters & sub-chapters",
              docs_url="/")

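# CORS: accept requests from any origin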
origins = [
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

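# Load the already-indexed specification sections (doc_id / section / content rows) and keep them in memory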
spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
spec_contents = spec_contents["train"].to_list()

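# True if at least one indexed section belongs to the given specification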
def is_doc_indexed(spec_id: str):
    return any(spec_id == s["doc_id"] for s in spec_contents)

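# Concatenate every indexed section of a specification into a single plain-text string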
def get_full_doc(spec_id: str):
    doc = []
    for spec in spec_contents:
        if spec["doc_id"] == spec_id:
            doc.append(f"{spec['section']}\n{spec['content']}")
    return "\n\n".join(doc)

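# Map each indexed section title to its content for a specification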
def get_structured_doc(spec_id: str):
    doc = {}
    for spec in spec_contents:
        if spec["doc_id"] == spec_id:
            doc[spec["section"]] = spec["content"]
    return doc


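# Request body shared by both endpoints: the specification number to fetch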
class SpecRequest(BaseModel):
    spec_id: str

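# Full text of a specification: served from the indexed dataset when available,
# otherwise downloaded and converted from the original Word archive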
@app.post("/get_full_text")
def get_text(request: SpecRequest):
    specification = request.spec_id
    if is_doc_indexed(specification):
        return get_full_doc(specification)
    print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")
    total_file = []
    # Ask the 3GPPDocFinder Space for the download URL of this specification
    find_response = requests.post(
        "https://organizedprogrammers-3gppdocfinder.hf.space/find",
        verify=False,
        headers={"Content-Type": "application/json"},
        json={"doc_id": specification}
    )

    if find_response.status_code != 200:
        raise HTTPException(status_code=404, detail="Not found")

    url = find_response.json()["url"]
    response = requests.get(
        url,
        verify=False,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
    )

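    # 3GPP archives sometimes wrap the spec in a single nested zip; unwrap it to reach the Word documents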
    zip_bytes = BytesIO(response.content)
    current_zip_file = zipfile.ZipFile(zip_bytes)
    for file_info in current_zip_file.infolist():
        if file_info.filename.endswith(".zip") and len(current_zip_file.namelist()) == 1:
            nested_zip_bytes = BytesIO(current_zip_file.read(file_info.filename))
            current_zip_file = zipfile.ZipFile(nested_zip_bytes)
            break
    
    for file_info in current_zip_file.infolist():
        filename = file_info.filename
        if (filename.endswith('.doc') or filename.endswith('.docx')) and ("cover" not in filename.lower() and "annex" not in filename.lower()):
            doc_bytes = current_zip_file.read(filename)
            ext = filename.split(".")[-1]
            input_path = f"/tmp/{specification}.{ext}"
            output_path = f"/tmp/{specification}.txt"
            # Write the Word document to disk and close it before converting,
            # so the bytes are flushed when LibreOffice reads the file
            with open(input_path, "wb") as f:
                f.write(doc_bytes)

            # Convert the document to plain text with a headless LibreOffice run
            subprocess.run([
                "libreoffice",
                "--headless",
                "--convert-to", "txt",
                "--outdir", "/tmp",
                input_path
            ], check=True)

            with open(output_path, "r") as converted:
                txt_data = [line.strip() for line in converted if line.strip()]

            os.remove(input_path)
            os.remove(output_path)
            total_file.extend(txt_data)
    if not total_file:
        raise HTTPException(status_code=404, detail="Not found")
    return total_file

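# Specification split by chapter: served from the indexed dataset when available,
# otherwise rebuilt by detecting chapter headings in the converted text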
@app.post("/get_spec_content")
def get_spec_content(request: SpecRequest):
    if is_doc_indexed(request.spec_id):
        return get_structured_doc(request.spec_id)
    text = get_text(request)
    chapters = []
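    # Chapter headings look like a dotted section number (optionally with a letter suffix),
    # a tab, then a title starting with an uppercase letter or digit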
    chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")

    for i, line in enumerate(text):
        if chapter_regex.fullmatch(line):
            chapters.append((i, line))

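    # Slice the text between consecutive headings to build {chapter title: content}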
    document = {}
    for i in range(len(chapters)):
        start_index, chapter_title = chapters[i]
        end_index = chapters[i+1][0] if i+1 < len(chapters) else len(text)
        content_lines = text[start_index + 1 : end_index]
        document[chapter_title.replace('\t', " ")] = "\n".join(content_lines)
    
    return document