Omar ID EL MOUMEN
committed on
Commit
·
366638d
1
Parent(s):
b215aa8
Change method of extracting scope (docx to pdf -> docx to txt)
Browse files
app.py
CHANGED
@@ -5,7 +5,7 @@ import requests
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
import json
|
7 |
import os
|
8 |
-
import
|
9 |
import uuid
|
10 |
import zipfile
|
11 |
import io
|
@@ -44,62 +44,110 @@ app.add_middleware(
|
|
44 |
allow_headers=["*"],
|
45 |
)
|
46 |
|
47 |
-
def
|
|
|
48 |
doc_id = specification
|
49 |
series = doc_id.split(".")[0]
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
if response.status_code != 200:
|
52 |
-
raise Exception("Téléchargement du ZIP échoué")
|
53 |
-
|
54 |
zip_bytes = io.BytesIO(response.content)
|
55 |
-
|
56 |
with zipfile.ZipFile(zip_bytes) as zf:
|
57 |
for file_name in zf.namelist():
|
58 |
-
if file_name.endswith("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
ext = file_name.split(".")[-1]
|
60 |
doc_bytes = zf.read(file_name)
|
61 |
temp_id = str(uuid.uuid4())
|
62 |
input_path = f"/tmp/{temp_id}.{ext}"
|
63 |
-
output_path = f"/tmp/{temp_id}.
|
64 |
-
|
|
|
65 |
with open(input_path, "wb") as f:
|
66 |
f.write(doc_bytes)
|
67 |
-
|
|
|
68 |
subprocess.run([
|
69 |
"libreoffice",
|
70 |
"--headless",
|
71 |
-
"--convert-to", "
|
72 |
"--outdir", "/tmp",
|
73 |
input_path
|
74 |
], check=True)
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
78 |
|
79 |
os.remove(input_path)
|
80 |
os.remove(output_path)
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
|
85 |
def get_scope(specification: str, version: str):
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
103 |
|
104 |
class DocRequest(BaseModel):
|
105 |
doc_id: str
|
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
import json
|
7 |
import os
|
8 |
+
import traceback
|
9 |
import uuid
|
10 |
import zipfile
|
11 |
import io
|
|
|
44 |
allow_headers=["*"],
|
45 |
)
|
46 |
|
47 |
+
def _find_spec_document(archive):
    """Return ``(member_name, raw_bytes)`` of the first usable Word document.

    Members are examined in archive order. A member ending in ``.zip`` is
    opened and searched recursively; a member ending in ``.doc``/``.docx``
    is returned unless its name contains "cover". Returns ``None`` when the
    archive (and any nested archive) holds no such document.
    """
    for member in archive.namelist():
        lowered = member.lower()
        if lowered.endswith(".zip"):
            print("Another ZIP !")
            # Open the nested archive under its own name so the outer
            # archive stays usable if the nested one yields nothing.
            with zipfile.ZipFile(io.BytesIO(archive.read(member))) as nested:
                found = _find_spec_document(nested)
            if found is not None:
                return found
        elif lowered.endswith((".doc", ".docx")):
            if "cover" in lowered:
                print("COVER !")
                continue
            return member, archive.read(member)
    return None


def _convert_doc_to_lines(doc_bytes, ext):
    """Convert Word document bytes to text with LibreOffice.

    The bytes are written to ``/tmp/<uuid>.<ext>``, converted to
    ``/tmp/<uuid>.txt`` by a headless ``libreoffice`` run, and read back as
    UTF-8. Returns the stripped, non-empty lines. Both temporary files are
    removed even when the conversion or the read fails.
    """
    temp_id = str(uuid.uuid4())
    input_path = f"/tmp/{temp_id}.{ext}"
    output_path = f"/tmp/{temp_id}.txt"

    try:
        print("Ecriture")
        with open(input_path, "wb") as f:
            f.write(doc_bytes)

        print("Convertissement")
        subprocess.run([
            "libreoffice",
            "--headless",
            "--convert-to", "txt",
            "--outdir", "/tmp",
            input_path
        ], check=True)

        print("Ecriture TXT")
        with open(output_path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]
    finally:
        # The output file does not exist if the conversion failed.
        for path in (input_path, output_path):
            if os.path.exists(path):
                os.remove(path)


def get_text(specification: str, version: str) -> list[str]:
    """Download a specification archive and return its text as lines.

    Fetches ``<spec-without-dots>-<version>.zip`` from the 3GPP archive URL
    built from ``specification``, locates the first non-cover ``.doc`` or
    ``.docx`` inside it (looking into nested ZIPs), and converts it to text
    with LibreOffice.

    Args:
        specification: Dotted specification identifier; the part before the
            first dot selects the ``<series>_series`` directory.
        version: Version string appended to the archive file name.

    Returns:
        The stripped, non-empty lines of the converted document.

    Raises:
        Exception: If the download does not return HTTP 200, or if no
            ``.doc``/``.docx`` file is found in the archive.
        subprocess.CalledProcessError: If the LibreOffice conversion fails.
    """
    doc_id = specification
    series = doc_id.split(".")[0]

    # NOTE(review): verify=False disables TLS certificate verification;
    # kept from the original, confirm it is still required.
    response = requests.get(
        f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
        verify=False,
        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
        timeout=60,  # without a timeout a stalled server blocks the caller forever
    )

    if response.status_code != 200:
        raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")

    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        found = _find_spec_document(zf)

    if found is None:
        raise Exception(f"Aucun fichier .doc/.docx trouvé dans le ZIP pour {specification}-{version}")

    file_name, doc_bytes = found
    return _convert_doc_to_lines(doc_bytes, file_name.split(".")[-1])
|
129 |
|
130 |
def get_scope(specification: str, version: str):
    """Return the "Scope" clause of a specification as a single string.

    The document text is obtained from ``get_text``. The clause is the text
    between the first line matching ``<digit> scope`` and the first later
    line matching ``<digit> references`` (both case-insensitive), with all
    whitespace runs collapsed to single spaces.

    If no scope heading is found, extraction starts after the first line.
    If no references heading follows, at most the nine lines after the
    scope heading are returned.

    Args:
        specification: Specification identifier passed to ``get_text``.
        version: Version string passed to ``get_text``.

    Returns:
        The scope text, or an error message string starting with
        "Erreur lors de l'extraction du scope" if anything raised.
    """
    try:
        spec_text = get_text(specification, version)

        scp_i = 0
        for i, line in enumerate(spec_text):
            if re.search(r"^\d\s+scope$", line.lower()):
                scp_i = i
                break

        nxt_i = scp_i + 10
        # Search only AFTER the scope heading, using absolute indices, so
        # the slice below can never end before it starts and a heading near
        # the end of the document is still reachable.
        for i in range(scp_i + 1, len(spec_text)):
            if re.search(r"^\d\s+references$", spec_text[i].lower()):
                nxt_i = i
                break

        return re.sub(r"\s+", " ", " ".join(spec_text[scp_i + 1:nxt_i]))
    except Exception as e:
        # Top-level boundary: callers receive a string either way, so log
        # the traceback and report the failure as text.
        traceback.print_exception(e)
        return f"Erreur lors de l'extraction du scope: {str(e)}"
|
151 |
|
152 |
class DocRequest(BaseModel):
|
153 |
doc_id: str
|