Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from PyPDF2 import PdfReader
|
4 |
+
|
5 |
+
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages in the PDF at *pdf_path*.

    Pages are joined in document order with no separator between them.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        A single string holding the extracted text of every page.
    """
    reader = PdfReader(pdf_path)
    # Join once instead of growing a string with ``+=`` per page. The
    # ``or ""`` guard keeps a page that yields no text (None) from raising
    # TypeError during concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
|
12 |
+
|
13 |
+
def extract_text_from_txt(txt_path):
    """Read the file at *txt_path* as UTF-8 text and return its full contents.

    Args:
        txt_path: Location of the text file to read.

    Returns:
        The entire file contents as one string.
    """
    with open(txt_path, encoding="utf-8") as source:
        contents = source.read()
    return contents
|
17 |
+
|
18 |
+
# Paths to the input files in the Hugging Face Space.
pdf_files = ["doc1.pdf", "doc2.pdf", "doc3.pdf", "doc4.pdf", "doc5.pdf"]
txt_files = ["doc6.txt", "doc7.txt", "doc8.txt"]

# Collected documents; each entry is {"id": <1-based position>, "content": <text>}.
documents = []

# Process the PDF files; paths that do not exist are skipped.
for pdf_name in pdf_files:
    if not os.path.exists(pdf_name):
        continue
    documents.append(
        {"id": len(documents) + 1, "content": extract_text_from_pdf(pdf_name)}
    )

# Process the TXT files; paths that do not exist are skipped.
for txt_name in txt_files:
    if not os.path.exists(txt_name):
        continue
    documents.append(
        {"id": len(documents) + 1, "content": extract_text_from_txt(txt_name)}
    )

# Write the collected documents to a JSON file, keeping non-ASCII text readable.
with open("documents.json", "w", encoding="utf-8") as output_file:
    json.dump(documents, output_file, ensure_ascii=False, indent=4)

print("documents.json wurde erfolgreich erstellt.")
|