Spaces:

OrganizedProgrammers
/

3GPPDocFinder

Running

App Files Community

Omar ID EL MOUMEN committed 7 days ago

Commit

366638d

1 Parent(s): b215aa8

Change method of extracting scope (docx to pdf -> docx to txt)

Browse files

Files changed (1) hide show

app.py +82 -34

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import requests
 from bs4 import BeautifulSoup
 import json
 import os
-import pymupdf as fitz
 import uuid
 import zipfile
 import io
@@ -44,62 +44,110 @@ app.add_middleware(
     allow_headers=["*"],
 )
-def get_pdf_bytes(specification: str, version: str):
     doc_id = specification
     series = doc_id.split(".")[0]
-    response = requests.get(f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip", verify=False)
     if response.status_code != 200:
-        raise Exception("Téléchargement du ZIP échoué")
     zip_bytes = io.BytesIO(response.content)
     with zipfile.ZipFile(zip_bytes) as zf:
         for file_name in zf.namelist():
-            if file_name.endswith("doc") or file_name.endswith("docx"):
                 ext = file_name.split(".")[-1]
                 doc_bytes = zf.read(file_name)
                 temp_id = str(uuid.uuid4())
                 input_path = f"/tmp/{temp_id}.{ext}"
-                output_path = f"/tmp/{temp_id}.pdf"
                 with open(input_path, "wb") as f:
                     f.write(doc_bytes)
                 subprocess.run([
                     "libreoffice",
                     "--headless",
-                    "--convert-to", "pdf",
                     "--outdir", "/tmp",
                     input_path
                 ], check=True)
-                with open(output_path, "rb") as f:
-                    pdf_data = f.read()
                 os.remove(input_path)
                 os.remove(output_path)
-                return io.BytesIO(pdf_data)
-        raise Exception("Aucun fichier .doc/.docx trouvé dans le ZIP")
 def get_scope(specification: str, version: str):
-    pdf_bytes = get_pdf_bytes(specification, version)
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    for content in doc.get_toc():
-        if "scope" in content[1].lower():
-            page_num = content[2] - 1
-            break
-    doc = doc[page_num:]
-    pdf_full_text = " ".join(page.get_text("text") for page in doc)
-    pdf_postprocess_text = re.sub(r"\s+", " ", pdf_full_text)
-    pdf_postprocess_text = pdf_postprocess_text.replace("1 Scope", " !-! ")
-    pdf_postprocess_text = pdf_postprocess_text.replace("2 Reference", " !-! ")
-    pdf_postprocess_text = pdf_postprocess_text.replace("", "- ")
-    return pdf_postprocess_text.split(" !-! ")[1]
 class DocRequest(BaseModel):
     doc_id: str

 from bs4 import BeautifulSoup
 import json
 import os
+import traceback
 import uuid
 import zipfile
 import io
     allow_headers=["*"],
 )
+def _is_spec_document(file_name: str) -> bool:
+    """Return True when an archive member looks like the specification body.
+
+    A member qualifies when its name ends in "doc" or "docx" and does not
+    contain "cover" (case-insensitive). Cover sheets are logged and skipped.
+    """
+    if not file_name.endswith(("doc", "docx")):
+        return False
+    if "cover" in file_name.lower():
+        print("COVER !")
+        return False
+    return True
+
+
+def _convert_doc_to_lines(doc_bytes: bytes, ext: str) -> list[str]:
+    """Convert a Word document to text and return its non-empty lines.
+
+    The bytes are written to a uniquely named file under /tmp, converted
+    with headless LibreOffice, and the resulting .txt file is read back.
+    Both temporary files are removed even when the conversion fails.
+
+    Raises:
+        subprocess.CalledProcessError: LibreOffice exited with a non-zero
+            status (``check=True``).
+    """
+    temp_id = str(uuid.uuid4())
+    input_path = f"/tmp/{temp_id}.{ext}"
+    output_path = f"/tmp/{temp_id}.txt"
+    try:
+        print("Ecriture")
+        with open(input_path, "wb") as f:
+            f.write(doc_bytes)
+        print("Convertissement")
+        subprocess.run([
+            "libreoffice",
+            "--headless",
+            "--convert-to", "txt",
+            "--outdir", "/tmp",
+            input_path
+        ], check=True)
+        print("Ecriture TXT")
+        # Always decode as UTF-8 so both archive layouts are read the same
+        # way instead of depending on the process locale.
+        with open(output_path, "r", encoding="utf-8") as f:
+            return [line.strip() for line in f if line.strip()]
+    finally:
+        # Clean up on success and on failure; the output file may not exist
+        # if LibreOffice failed before producing it.
+        for path in (input_path, output_path):
+            try:
+                os.remove(path)
+            except FileNotFoundError:
+                pass
+
+
+def get_text(specification: str, version: str):
+    """Download a specification archive and return the document's text lines.
+
+    The ZIP for ``specification``/``version`` is fetched from the 3GPP
+    archive. The first member ending in "doc"/"docx" that is not a cover
+    sheet is converted to text; members ending in "zip" are opened and
+    searched the same way.
+
+    Returns:
+        The stripped, non-empty lines of the converted document.
+
+    Raises:
+        Exception: the download did not return HTTP 200, or no suitable
+            document was found in the archive.
+        subprocess.CalledProcessError: the LibreOffice conversion failed.
+    """
     doc_id = specification
     series = doc_id.split(".")[0]
+    # NOTE(review): verify=False disables TLS certificate verification for
+    # this download; confirm that this is intentional.
+    response = requests.get(
+        f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
+        verify=False,
+        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+    )
     if response.status_code != 200:
+        raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")
     zip_bytes = io.BytesIO(response.content)
     with zipfile.ZipFile(zip_bytes) as zf:
         for file_name in zf.namelist():
+            if file_name.endswith("zip"):
+                print("Another ZIP !")
+                # Open the nested archive under its own name and context
+                # manager: rebinding `zf` would leave it unclosed and make
+                # later reads of outer members hit the wrong archive.
+                with zipfile.ZipFile(io.BytesIO(zf.read(file_name))) as inner_zf:
+                    for inner_name in inner_zf.namelist():
+                        if _is_spec_document(inner_name):
+                            return _convert_doc_to_lines(
+                                inner_zf.read(inner_name),
+                                inner_name.split(".")[-1],
+                            )
+            elif _is_spec_document(file_name):
+                return _convert_doc_to_lines(
+                    zf.read(file_name),
+                    file_name.split(".")[-1],
+                )
+    raise Exception(f"Aucun fichier .doc/.docx trouvé dans le ZIP pour {specification}-{version}")
 def get_scope(specification: str, version: str):
+    """Return the text of the "Scope" clause of a specification.
+
+    The clause starts after the first line matching "<digit> scope" and
+    ends before the next line matching "<digit> references" (both
+    case-insensitive, whole line). If no scope heading is found the start
+    defaults to index 0; if no references heading follows, the end defaults
+    to ten lines after the start.
+
+    Returns:
+        The clause text with whitespace runs collapsed to single spaces, or
+        an error message string if anything fails (the traceback is
+        printed rather than raised).
+    """
+    try:
+        spec_text = get_text(specification, version)
+        scp_i = next(
+            (
+                i for i, line in enumerate(spec_text)
+                if re.search(r"^\d\s+scope$", line.lower())
+            ),
+            0,
+        )
+        # Look for the references heading only AFTER the scope heading.
+        # The previous loop indexed from 0 over a shortened range, so it
+        # could miss a heading near the end of the document or match one
+        # located before the scope heading.
+        nxt_i = next(
+            (
+                i for i in range(scp_i + 1, len(spec_text))
+                if re.search(r"^\d\s+references$", spec_text[i].lower())
+            ),
+            scp_i + 10,
+        )
+        return re.sub(r"\s+", " ", " ".join(spec_text[scp_i+1:nxt_i]))
+    except Exception as e:
+        traceback.print_exception(e)
+        return f"Erreur lors de l'extraction du scope: {str(e)}"
 class DocRequest(BaseModel):
     doc_id: str