NaimaAqeel committed on
Commit
3ee65d3
·
verified ·
1 Parent(s): 2b4be53

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -9
app.py CHANGED
@@ -4,16 +4,16 @@ import numpy as np
4
  import PyPDF2
5
  import io
6
  from docx import Document
 
 
7
  from sentence_transformers import SentenceTransformer
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_community.embeddings import HuggingFaceEmbeddings
10
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
11
- from nltk.tokenize import sent_tokenize
12
- import torch
13
  import gradio as gr
14
  import pickle
15
- import nltk
16
 
 
 
17
  nltk.download('punkt')
18
 
19
  # Function to extract text from a PDF file
@@ -37,17 +37,17 @@ def extract_text_from_docx(docx_data):
37
  print(f"Error extracting text from DOCX: {e}")
38
  return text
39
 
40
- # Initialize the embedding model
41
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
42
 
43
- # Hugging Face API token
44
  api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
45
  if not api_token:
46
  raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
47
 
48
- # Define RAG models
49
  generator_model_name = "facebook/bart-base"
50
- retriever_model_name = "facebook/bart-base" # Can be the same as generator
51
  generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
52
  generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
53
  retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
@@ -132,7 +132,7 @@ def process_and_query(state, question):
132
  # Create Gradio interface
133
  iface = gr.Interface(
134
  fn=upload_files,
135
- inputs=gr.inputs.FileContent(label="Upload PDF or DOCX file"),
136
  outputs="json",
137
  live=True,
138
  capture_session=True
 
4
  import PyPDF2
5
  import io
6
  from docx import Document
7
+ from nltk.tokenize import sent_tokenize
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
  from sentence_transformers import SentenceTransformer
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
 
 
12
  import gradio as gr
13
  import pickle
 
14
 
15
+ # Download NLTK punkt tokenizer if not already downloaded
16
+ import nltk
17
  nltk.download('punkt')
18
 
19
  # Function to extract text from a PDF file
 
37
  print(f"Error extracting text from DOCX: {e}")
38
  return text
39
 
40
+ # Initialize Sentence Transformer model for embeddings
41
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
42
 
43
+ # Initialize Hugging Face API token
44
  api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
45
  if not api_token:
46
  raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
47
 
48
+ # Initialize RAG models from Hugging Face
49
  generator_model_name = "facebook/bart-base"
50
+ retriever_model_name = "facebook/bart-base"
51
  generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
52
  generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
53
  retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
 
132
  # Create Gradio interface
133
  iface = gr.Interface(
134
  fn=upload_files,
135
+ inputs=gr.inputs.File(label="Upload PDF or DOCX file"),
136
  outputs="json",
137
  live=True,
138
  capture_session=True