Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -16,17 +16,42 @@ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-b
|
|
16 |
vector_store = None
|
17 |
retriever = None
|
18 |
|
19 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
chunk_size = 1000 # Example chunk size
|
21 |
overlap = 100 # Example overlap
|
22 |
loader = UnstructuredLoader([filepath], chunk_size=chunk_size, overlap=overlap)
|
23 |
-
|
|
|
|
|
|
|
24 |
for doc in loader.lazy_load():
|
25 |
-
|
26 |
-
|
|
|
|
|
27 |
|
28 |
-
def extract_tables_from_pdf(filepath):
|
29 |
-
|
|
|
|
|
|
|
30 |
return [table.df.to_string(index=False) for table in tables]
|
31 |
|
32 |
def update_documents(text_input):
|
@@ -63,9 +88,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
|
|
63 |
response = rag_chain({"question": message, "chat_history": history})
|
64 |
return response['answer']
|
65 |
|
66 |
-
def upload_file(filepath):
|
67 |
-
text = extract_text_from_pdf(filepath)
|
68 |
-
tables = extract_tables_from_pdf(filepath)
|
69 |
|
70 |
# Update documents in the vector store
|
71 |
update_documents(text)
|
@@ -81,12 +106,13 @@ with demo:
|
|
81 |
with gr.Row():
|
82 |
u = gr.UploadButton("Upload a file", file_count="single")
|
83 |
d = gr.DownloadButton("Download the file", visible=False)
|
|
|
84 |
|
85 |
# Create a Textbox for the status message
|
86 |
status_output = gr.Textbox(label="Status", visible=True)
|
87 |
|
88 |
# Use the proper output components in the upload method
|
89 |
-
u.upload(upload_file, u, [u, d, status_output])
|
90 |
|
91 |
with gr.Row():
|
92 |
chat = gr.ChatInterface(
|
|
|
16 |
vector_store = None
|
17 |
retriever = None
|
18 |
|
19 |
+
def parse_page_input(page_input):
    """Parse a page selection such as ``"1, 2, 5-7"`` into page numbers.

    The input is split on commas. Each item is either a single integer or an
    inclusive ``start-end`` range. Items that cannot be parsed (non-numeric
    text, ranges with more or fewer than two bounds, empty items) are skipped
    rather than raising, so one typo does not discard the whole selection.
    A range whose start exceeds its end contributes no pages.

    Args:
        page_input: Comma-separated page numbers and ranges. ``None`` or an
            empty string yields an empty result.

    Returns:
        A sorted list of unique page numbers.
    """
    if not page_input:
        return []

    pages = set()
    for part in page_input.split(","):
        part = part.strip()
        try:
            if "-" in part:
                # Unpacking lives inside the ``try`` on purpose: an item such
                # as "1-2-3" raises ValueError from the unpacking itself and
                # must be skipped like any other malformed range.
                start, end = (int(bound) for bound in part.split("-"))
            else:
                start = end = int(part)
        except ValueError:
            continue  # Skip invalid page numbers and ranges
        pages.update(range(start, end + 1))
    return sorted(pages)
|
35 |
+
|
36 |
+
def extract_text_from_pdf(filepath, pages):
    """Load a document and return the text of the requested pages.

    Args:
        filepath: Path of the file handed to ``UnstructuredLoader``.
        pages: User-supplied page selection (e.g. ``"1, 2, 5-7"``), parsed
            with ``parse_page_input``. ``None`` or a blank string selects
            every page, matching how ``extract_tables_from_pdf`` treats an
            empty selection.

    Returns:
        The ``page_content`` of every selected document chunk, joined with
        newlines. Empty string when nothing matches.
    """
    chunk_size = 1000  # Example chunk size
    overlap = 100  # Example overlap
    loader = UnstructuredLoader([filepath], chunk_size=chunk_size, overlap=overlap)

    # A blank selection means "no filter"; otherwise keep the parsed pages in
    # a set so the per-document membership test is O(1).
    select_all = not (pages or "").strip()
    pages_to_load = set() if select_all else set(parse_page_input(pages))

    pages_data = []
    for doc in loader.lazy_load():
        # Prefer a direct attribute if the loader provides one, but fall back
        # to the metadata mapping, which is where LangChain documents normally
        # carry the page number. NOTE(review): confirm the metadata key
        # against the installed loader version.
        page_number = getattr(doc, "page_number", None)
        if page_number is None:
            metadata = getattr(doc, "metadata", None) or {}
            page_number = metadata.get("page_number")
        if select_all or page_number in pages_to_load:
            pages_data.append(doc.page_content)

    return "\n".join(pages_data)
|
49 |
|
50 |
+
def extract_tables_from_pdf(filepath, pages):
    """Extract tables from a PDF with Camelot and render each as plain text.

    Args:
        filepath: Path of the PDF passed to ``camelot.read_pdf``.
        pages: Page selection string (e.g. ``"1, 2, 5-7"``). ``None``, an
            empty string, or a whitespace-only string selects all pages
            (``'1-end'``).

    Returns:
        A list with one string per detected table, produced by
        ``DataFrame.to_string(index=False)``.
    """
    # Normalise the user's text before handing it to Camelot: drop all
    # whitespace and empty comma-separated items so that input such as
    # "1, 2," or "   " does not reach Camelot as an unparsable page spec.
    compact = "".join(pages.split()) if pages else ""
    page_spec = ",".join(item for item in compact.split(",") if item)

    tables = camelot.read_pdf(filepath, pages=page_spec or "1-end")
    return [table.df.to_string(index=False) for table in tables]
|
56 |
|
57 |
def update_documents(text_input):
|
|
|
88 |
response = rag_chain({"question": message, "chat_history": history})
|
89 |
return response['answer']
|
90 |
|
91 |
+
def upload_file(filepath, pages):
|
92 |
+
text = extract_text_from_pdf(filepath, pages)
|
93 |
+
tables = extract_tables_from_pdf(filepath, pages)
|
94 |
|
95 |
# Update documents in the vector store
|
96 |
update_documents(text)
|
|
|
106 |
with gr.Row():
|
107 |
u = gr.UploadButton("Upload a file", file_count="single")
|
108 |
d = gr.DownloadButton("Download the file", visible=False)
|
109 |
+
page_input = gr.Textbox(label="Pages to Parse (e.g., 1, 2, 5-7)", placeholder="Enter page numbers or ranges")
|
110 |
|
111 |
# Create a Textbox for the status message
|
112 |
status_output = gr.Textbox(label="Status", visible=True)
|
113 |
|
114 |
# Use the proper output components in the upload method
|
115 |
+
u.upload(upload_file, [u, page_input], [u, d, status_output])
|
116 |
|
117 |
with gr.Row():
|
118 |
chat = gr.ChatInterface(
|