raj999 committed on
Commit
af514b7
·
verified ·
1 Parent(s): 9ac8550

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -10
app.py CHANGED
@@ -16,17 +16,42 @@ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-b
16
  vector_store = None
17
  retriever = None
18
 
19
def extract_text_from_pdf(filepath):
    """Return the text of every document the loader yields for ``filepath``.

    The file is read through ``UnstructuredLoader`` and the ``page_content``
    of each yielded document is joined with newlines.
    """
    # Example chunk size and example overlap handed to the loader.
    loader = UnstructuredLoader([filepath], chunk_size=1000, overlap=100)
    return "\n".join(document.page_content for document in loader.lazy_load())
 
 
27
 
28
def extract_tables_from_pdf(filepath):
    """Return each table camelot finds in ``filepath`` as a plain string.

    All pages ('1-end') are scanned; every table's DataFrame is rendered
    without its index column.
    """
    rendered = []
    for found in camelot.read_pdf(filepath, pages='1-end'):
        rendered.append(found.df.to_string(index=False))
    return rendered
31
 
32
  def update_documents(text_input):
@@ -63,9 +88,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
63
  response = rag_chain({"question": message, "chat_history": history})
64
  return response['answer']
65
 
66
- def upload_file(filepath):
67
- text = extract_text_from_pdf(filepath)
68
- tables = extract_tables_from_pdf(filepath)
69
 
70
  # Update documents in the vector store
71
  update_documents(text)
@@ -81,12 +106,13 @@ with demo:
81
  with gr.Row():
82
  u = gr.UploadButton("Upload a file", file_count="single")
83
  d = gr.DownloadButton("Download the file", visible=False)
 
84
 
85
  # Create a Textbox for the status message
86
  status_output = gr.Textbox(label="Status", visible=True)
87
 
88
  # Use the proper output components in the upload method
89
- u.upload(upload_file, u, [u, d, status_output])
90
 
91
  with gr.Row():
92
  chat = gr.ChatInterface(
 
16
  vector_store = None
17
  retriever = None
18
 
19
def parse_page_input(page_input):
    """Parse a page selection string into a sorted list of page numbers.

    Args:
        page_input: Comma-separated page numbers and inclusive ranges,
            e.g. ``"1, 2, 5-7"``. ``None`` or an empty string selects nothing.

    Returns:
        A sorted list of unique page numbers. Parts that cannot be parsed
        (non-numeric text, malformed ranges such as ``"1-2-3"``) are skipped.
    """
    if not page_input:
        return []

    pages = set()
    for part in page_input.split(","):
        part = part.strip()
        if not part:
            continue
        try:
            if "-" in part:  # Handle ranges
                # Unpacking stays inside the try: a part with more than one
                # dash raises ValueError here and must be skipped, not crash.
                start, end = part.split("-")
                pages.update(range(int(start), int(end) + 1))
            else:  # Handle individual pages
                pages.add(int(part))
        except ValueError:
            continue  # Skip invalid ranges and invalid page numbers
    return sorted(pages)  # Return a sorted list of pages
35
+
36
def extract_text_from_pdf(filepath, pages):
    """Extract text from a PDF, optionally restricted to selected pages.

    Args:
        filepath: Path of the PDF handed to ``UnstructuredLoader``.
        pages: Page selection string such as ``"1, 2, 5-7"``. When it is
            ``None`` or blank, every page is kept, mirroring the all-pages
            fallback in ``extract_tables_from_pdf``.

    Returns:
        The ``page_content`` of the kept documents, joined by newlines.
    """
    chunk_size = 1000  # Example chunk size
    overlap = 100  # Example overlap
    loader = UnstructuredLoader([filepath], chunk_size=chunk_size, overlap=overlap)

    # A set gives O(1) membership checks; None means "no filter requested".
    if pages and pages.strip():
        pages_to_load = set(parse_page_input(pages))
    else:
        pages_to_load = None

    pages_data = []
    for doc in loader.lazy_load():
        if pages_to_load is not None:
            # The page number is looked up in the document metadata rather
            # than as an attribute, which would raise AttributeError.
            # NOTE(review): confirm the "page_number" key for the loader
            # version in use.
            metadata = getattr(doc, "metadata", None) or {}
            page_number = metadata.get(
                "page_number", getattr(doc, "page_number", None)
            )
            if page_number not in pages_to_load:
                continue
        pages_data.append(doc.page_content)

    return "\n".join(pages_data)
49
 
50
def extract_tables_from_pdf(filepath, pages):
    """Return the tables camelot finds in ``filepath`` as plain strings.

    ``pages`` is passed straight to ``camelot.read_pdf``; when it is falsy
    the whole document ('1-end') is scanned instead. Each table's DataFrame
    is rendered without its index column.
    """
    page_spec = pages if pages else '1-end'
    rendered = []
    for found in camelot.read_pdf(filepath, pages=page_spec):
        rendered.append(found.df.to_string(index=False))
    return rendered
56
 
57
  def update_documents(text_input):
 
88
  response = rag_chain({"question": message, "chat_history": history})
89
  return response['answer']
90
 
91
+ def upload_file(filepath, pages):
92
+ text = extract_text_from_pdf(filepath, pages)
93
+ tables = extract_tables_from_pdf(filepath, pages)
94
 
95
  # Update documents in the vector store
96
  update_documents(text)
 
106
  with gr.Row():
107
  u = gr.UploadButton("Upload a file", file_count="single")
108
  d = gr.DownloadButton("Download the file", visible=False)
109
+ page_input = gr.Textbox(label="Pages to Parse (e.g., 1, 2, 5-7)", placeholder="Enter page numbers or ranges")
110
 
111
  # Create a Textbox for the status message
112
  status_output = gr.Textbox(label="Status", visible=True)
113
 
114
  # Use the proper output components in the upload method
115
+ u.upload(upload_file, [u, page_input], [u, d, status_output])
116
 
117
  with gr.Row():
118
  chat = gr.ChatInterface(