Omar ID EL MOUMEN committed on
Commit
366638d
·
1 Parent(s): b215aa8

Change method of extracting scope (docx to pdf -> docx to txt)

Browse files
Files changed (1) hide show
  1. app.py +82 -34
app.py CHANGED
@@ -5,7 +5,7 @@ import requests
5
  from bs4 import BeautifulSoup
6
  import json
7
  import os
8
- import pymupdf as fitz
9
  import uuid
10
  import zipfile
11
  import io
@@ -44,62 +44,110 @@ app.add_middleware(
44
  allow_headers=["*"],
45
  )
46
 
47
- def get_pdf_bytes(specification: str, version: str):
 
48
  doc_id = specification
49
  series = doc_id.split(".")[0]
50
- response = requests.get(f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip", verify=False)
 
 
 
 
 
 
51
  if response.status_code != 200:
52
- raise Exception("Téléchargement du ZIP échoué")
53
-
54
  zip_bytes = io.BytesIO(response.content)
55
-
56
  with zipfile.ZipFile(zip_bytes) as zf:
57
  for file_name in zf.namelist():
58
- if file_name.endswith("doc") or file_name.endswith("docx"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  ext = file_name.split(".")[-1]
60
  doc_bytes = zf.read(file_name)
61
  temp_id = str(uuid.uuid4())
62
  input_path = f"/tmp/{temp_id}.{ext}"
63
- output_path = f"/tmp/{temp_id}.pdf"
64
-
 
65
  with open(input_path, "wb") as f:
66
  f.write(doc_bytes)
67
-
 
68
  subprocess.run([
69
  "libreoffice",
70
  "--headless",
71
- "--convert-to", "pdf",
72
  "--outdir", "/tmp",
73
  input_path
74
  ], check=True)
75
-
76
- with open(output_path, "rb") as f:
77
- pdf_data = f.read()
 
78
 
79
  os.remove(input_path)
80
  os.remove(output_path)
81
-
82
- return io.BytesIO(pdf_data)
83
- raise Exception("Aucun fichier .doc/.docx trouvé dans le ZIP")
84
 
85
  def get_scope(specification: str, version: str):
86
- pdf_bytes = get_pdf_bytes(specification, version)
87
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
88
-
89
- for content in doc.get_toc():
90
- if "scope" in content[1].lower():
91
- page_num = content[2] - 1
92
- break
93
-
94
- doc = doc[page_num:]
95
-
96
- pdf_full_text = " ".join(page.get_text("text") for page in doc)
97
- pdf_postprocess_text = re.sub(r"\s+", " ", pdf_full_text)
98
- pdf_postprocess_text = pdf_postprocess_text.replace("1 Scope", " !-! ")
99
- pdf_postprocess_text = pdf_postprocess_text.replace("2 Reference", " !-! ")
100
- pdf_postprocess_text = pdf_postprocess_text.replace("", "- ")
101
-
102
- return pdf_postprocess_text.split(" !-! ")[1]
 
 
 
103
 
104
  class DocRequest(BaseModel):
105
  doc_id: str
 
5
  from bs4 import BeautifulSoup
6
  import json
7
  import os
8
+ import traceback
9
  import uuid
10
  import zipfile
11
  import io
 
44
  allow_headers=["*"],
45
  )
46
 
def _convert_doc_to_lines(doc_bytes: bytes, ext: str) -> list[str]:
    """Convert a Word document to plain text and return its non-empty lines.

    The bytes are written to a uniquely named file under /tmp, converted with
    headless LibreOffice, and the resulting .txt is read back as UTF-8.
    Both temporary files are removed even when the conversion fails.

    Raises:
        subprocess.CalledProcessError: if LibreOffice exits with an error.
        FileNotFoundError: if no .txt output was produced.
    """
    temp_id = str(uuid.uuid4())
    input_path = f"/tmp/{temp_id}.{ext}"
    # The converter is expected to write <input stem>.txt into --outdir.
    output_path = f"/tmp/{temp_id}.txt"

    try:
        print("Ecriture")
        with open(input_path, "wb") as f:
            f.write(doc_bytes)

        print("Convertissement")
        subprocess.run([
            "libreoffice",
            "--headless",
            "--convert-to", "txt",
            "--outdir", "/tmp",
            input_path
        ], check=True)

        print("Ecriture TXT")
        with open(output_path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]
    finally:
        # Best-effort cleanup: either file may be missing if an earlier step failed.
        for path in (input_path, output_path):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass


def _extract_spec_lines(zf: zipfile.ZipFile, *, allow_nested: bool) -> list[str] | None:
    """Return the text lines of the first non-cover .doc/.docx found in *zf*.

    When *allow_nested* is true, a .zip member is opened and searched one
    level deep; if it holds no usable document the search continues with the
    remaining members of *zf*. Returns None when nothing suitable is found.
    """
    for file_name in zf.namelist():
        lowered = file_name.lower()

        if allow_nested and lowered.endswith(".zip"):
            print("Another ZIP !")
            with zipfile.ZipFile(io.BytesIO(zf.read(file_name))) as inner:
                txt_data = _extract_spec_lines(inner, allow_nested=False)
            if txt_data is not None:
                return txt_data
        elif lowered.endswith((".doc", ".docx")):
            # Cover pages ship alongside the real specification; skip them.
            if "cover" in lowered:
                print("COVER !")
                continue
            ext = file_name.split(".")[-1]
            return _convert_doc_to_lines(zf.read(file_name), ext)

    return None


def get_text(specification: str, version: str) -> list[str]:
    """Download a specification archive and return the document text as lines.

    The ZIP for *specification* / *version* is fetched from the 3GPP archive
    URL, the first .doc/.docx member that is not a cover page is located
    (looking one level into a nested ZIP if present), and it is converted to
    text with LibreOffice.

    Args:
        specification: Dotted specification identifier; the part before the
            first dot selects the series directory.
        version: Version string appended to the archive file name.

    Returns:
        The non-empty lines of the converted document, each stripped.

    Raises:
        Exception: if the download does not return HTTP 200, or if the
            archive contains no usable .doc/.docx file.
    """
    doc_id = specification
    series = doc_id.split(".")[0]

    # NOTE(review): verify=False disables TLS certificate validation for this
    # download; kept as-is, but it should be revisited.
    response = requests.get(
        f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
        verify=False,
        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
        # Without a timeout an unresponsive server would block the caller forever.
        timeout=60,
    )

    if response.status_code != 200:
        raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")

    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        txt_data = _extract_spec_lines(zf, allow_nested=True)

    if txt_data is None:
        raise Exception(f"Aucun fichier .doc/.docx trouvé dans le ZIP pour {specification}-{version}")
    return txt_data
129
 
def get_scope(specification: str, version: str):
    """Return the text of the "Scope" clause of a specification.

    The document lines come from ``get_text``. The clause starts after the
    first line that looks like "<digit> Scope" and ends before the next line
    that looks like "<digit> References". Whitespace runs in the result are
    collapsed to single spaces.

    Fallbacks kept from the existing behaviour: if no Scope heading is found
    the extraction starts from the top of the document, and if no References
    heading follows, at most the next nine lines are returned.

    Args:
        specification: Specification identifier, passed through to ``get_text``.
        version: Version string, passed through to ``get_text``.

    Returns:
        The scope text, or an error message string if anything fails (the
        exception is printed, not propagated).
    """
    try:
        spec_text = get_text(specification, version)

        scp_i = 0
        for i, line in enumerate(spec_text):
            if re.search(r"^\d\s+scope$", line.lower()):
                scp_i = i
                break

        nxt_i = scp_i + 10
        # Search only the lines after the Scope heading, using absolute
        # indices so the slice below is bounded correctly.
        for i in range(scp_i + 1, len(spec_text)):
            if re.search(r"^\d\s+references$", spec_text[i].lower()):
                nxt_i = i
                break

        return re.sub(r"\s+", " ", " ".join(spec_text[scp_i + 1:nxt_i]))
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as text.
        traceback.print_exception(e)
        return f"Erreur lors de l'extraction du scope: {str(e)}"
151
 
152
  class DocRequest(BaseModel):
153
  doc_id: str