NaimaAqeel committed on
Commit
f7133fb
·
verified ·
1 Parent(s): f812db9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -43
app.py CHANGED
@@ -17,9 +17,17 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
17
  def extract_text_from_pdf(pdf_path):
18
  # ...
19
 
20
- # Function to extract text from a Word document (same as before)
21
  def extract_text_from_docx(docx_path):
22
- # ...
 
 
 
 
 
 
 
 
23
 
24
  # Initialize the embedding model (same as before)
25
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -50,14 +58,19 @@ document_texts = []
50
 
51
 
52
  def preprocess_text(text):
53
- # ... (text preprocessing logic, same as before)
54
 
55
 
56
  def upload_files(files):
57
  global index, document_texts
58
  try:
59
  for file_path in files:
60
- # ... (file processing logic, same as before)
 
 
 
 
 
61
 
62
  # Preprocess text (call the new function)
63
  sentences = preprocess_text(text)
@@ -66,48 +79,11 @@ def upload_files(files):
66
  embeddings = embedding_model.encode(sentences)
67
  index.add(np.array(embeddings))
68
 
69
- # Save the updated index and documents (same as before)
70
- # ...
71
  return "Files processed successfully"
72
  except Exception as e:
73
- print(f"Error processing files: {e}")
74
- return f"Error processing files: {e}"
75
-
76
-
77
- def query_text(text):
78
- try:
79
- # Preprocess query text
80
- query_sentences = preprocess_text(text)
81
- query_embeddings = embedding_model.encode(query_sentences)
82
-
83
- # Retrieve relevant documents using FAISS
84
- D, I = index.search(np.array(query_embeddings), k=5)
85
- retrieved_docs = [document_texts[idx] for idx in I[0] if idx != -1]
86
-
87
- # Retriever-Augmented Generation (RAG)
88
- retriever_inputs = retriever_tokenizer(
89
- text=retrieved_docs, return_tensors="pt", padding=True
90
- )
91
- retriever_outputs = retriever(**retriever_inputs)
92
- retrieved_texts = retriever_tokenizer.batch_decode(retriever_outputs.logits)
93
-
94
- # Generate response using retrieved information (as prompts/context)
95
- generator_inputs = generator_tokenizer(
96
- text=[text] + retrieved_texts, return_tensors="pt", padding=True
97
- )
98
- generator_outputs = generator(**generator_inputs)
99
- response = generator_tokenizer.decode(generator_outputs.sequences[0], skip_special_tokens=True)
100
-
101
- return response
102
- except Exception as e:
103
- print(f"Error querying text: {e}")
104
- return f"Error querying text: {e}"
105
-
106
 
107
- # Create Gradio interface
108
- with gr.Blocks() as demo:
109
- # ... (rest of the Gradio interface definition)
110
- query_button.click(fn=query_text, inputs
111
 
112
 
113
 
 
17
  def extract_text_from_pdf(pdf_path):
18
  # ...
19
 
20
+ # Function to extract text from a Word document (fixed indentation)
21
  def extract_text_from_docx(docx_path):
22
+ """Extracts text from a Word document."""
23
+ text = ""
24
+ try:
25
+ doc = Document(docx_path)
26
+ text = "\n".join([para.text for para in doc.paragraphs])
27
+ except Exception as e:
28
+ print(f"Error extracting text from DOCX: {e}")
29
+ return text
30
+
31
 
32
  # Initialize the embedding model (same as before)
33
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
58
 
59
 
60
  def preprocess_text(text):
61
+ # ... (text preprocessing logic, e.g., sentence segmentation and optional stop word removal)
62
 
63
 
64
  def upload_files(files):
65
  global index, document_texts
66
  try:
67
  for file_path in files:
68
+ if file_path.endswith('.pdf'):
69
+ text = extract_text_from_pdf(file_path)
70
+ elif file_path.endswith('.docx'):
71
+ text = extract_text_from_docx(file_path)
72
+ else:
73
+ return "Unsupported file format"
74
 
75
  # Preprocess text (call the new function)
76
  sentences = preprocess_text(text)
 
79
  embeddings = embedding_model.encode(sentences)
80
  index.add(np.array(embeddings))
81
 
82
+ # Save the updated index and documents
 
83
  return "Files processed successfully"
84
  except Exception as e:
85
+ print(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
 
 
 
 
87
 
88
 
89