Update app.py
app.py
CHANGED
@@ -1,20 +1,34 @@
 import gradio as gr
 import os
-from langchain.vectorstores import Chroma
+from langchain.vectorstores import Chroma
+from langchain.document_loaders import PyPDFLoader  # Correctly imported
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
 from langchain.prompts import PromptTemplate
-from
+from pdf2image import convert_from_path
+from transformers import LayoutLMv3Processor, AutoModelForTokenClassification
 
-
-
-
-
-
-
+# Set up the OCR model
+class LayoutLMv3OCR:
+    def __init__(self):
+        self.processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
+        self.model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
+
+    def extract_text(self, pdf_path):
+        images = convert_from_path(pdf_path)
+        text_pages = []
+        for image in images:
+            inputs = self.processor(images=image, return_tensors="pt")
+            outputs = self.model(**inputs)
+            text = self.processor.batch_decode(outputs.logits, skip_special_tokens=True)[0]
+            text_pages.append(text)
+        return text_pages
+
+ocr_tool = LayoutLMv3OCR()
 
-
-
+def process_pdf_and_query(pdf_path, question):
+    loader = PyPDFLoader(pdf_path)
+    documents = loader.load()
 
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 vectordb = Chroma.from_documents(documents, embeddings)
@@ -29,7 +43,7 @@ def process_pdf_and_query(pdf_path, question):
 
 def chatbot_response(pdf, question):
     # Gradio gives us the PDF as a NamedString; we extract the content as a byte stream
-    pdf_path = "/
+    pdf_path = "/mnt/data/uploaded_pdf.pdf"  # Uses the persistent directory on Hugging Face Spaces
 
     # Extract the file content as bytes
     pdf_content = pdf.read()  # Here we get the PDF content as a byte stream
@@ -38,12 +52,12 @@ def chatbot_response(pdf, question):
     with open(pdf_path, "wb") as f:
         f.write(pdf_content)
 
-    #
+    # Extract OCR text
+    extracted_text = ocr_tool.extract_text(pdf_path)
+
+    # Answer the question based on the PDF and OCR content
    answer = process_pdf_and_query(pdf_path, question)
 
-    # Delete the temporary file
-    os.remove(pdf_path)
-
     return answer
 
 # Gradio Interface
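
The diff's final context line is the "# Gradio Interface" comment, but the interface definition itself lies outside the changed hunks. Below is a minimal sketch of how chatbot_response might be wired into a Gradio interface, assuming a file-upload component and a textbox; the component choices and labels are assumptions, not taken from the actual app.py.

    import gradio as gr

    # Hypothetical wiring for the "# Gradio Interface" section; the real
    # component setup is not shown in the hunks above.
    iface = gr.Interface(
        fn=chatbot_response,               # defined in app.py: takes (pdf, question), returns the answer
        inputs=[
            gr.File(label="PDF file"),     # delivered to chatbot_response as `pdf`
            gr.Textbox(label="Question"),  # delivered as `question`
        ],
        outputs=gr.Textbox(label="Answer"),
    )

    if __name__ == "__main__":
        iface.launch()

Depending on the Gradio version and the File component's type setting, chatbot_response may receive a file path rather than a file-like object, which affects whether pdf.read() works as written. Note also that pdf2image's convert_from_path relies on the poppler utilities at runtime; on Hugging Face Spaces these are usually provided by listing poppler-utils in a packages.txt file at the repository root.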