Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,16 +4,16 @@ import numpy as np
|
|
4 |
import PyPDF2
|
5 |
import io
|
6 |
from docx import Document
|
|
|
|
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
from langchain_community.vectorstores import FAISS
|
9 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
10 |
-
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
11 |
-
from nltk.tokenize import sent_tokenize
|
12 |
-
import torch
|
13 |
import gradio as gr
|
14 |
import pickle
|
15 |
-
import nltk
|
16 |
|
|
|
|
|
17 |
nltk.download('punkt')
|
18 |
|
19 |
# Function to extract text from a PDF file
|
@@ -37,17 +37,17 @@ def extract_text_from_docx(docx_data):
|
|
37 |
print(f"Error extracting text from DOCX: {e}")
|
38 |
return text
|
39 |
|
40 |
-
# Initialize
|
41 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
42 |
|
43 |
-
# Hugging Face API token
|
44 |
api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
|
45 |
if not api_token:
|
46 |
raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
|
47 |
|
48 |
-
#
|
49 |
generator_model_name = "facebook/bart-base"
|
50 |
-
retriever_model_name = "facebook/bart-base"
|
51 |
generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
|
52 |
generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
|
53 |
retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
|
@@ -132,7 +132,7 @@ def process_and_query(state, question):
|
|
132 |
# Create Gradio interface
|
133 |
iface = gr.Interface(
|
134 |
fn=upload_files,
|
135 |
-
inputs=gr.inputs.
|
136 |
outputs="json",
|
137 |
live=True,
|
138 |
capture_session=True
|
|
|
4 |
import PyPDF2
|
5 |
import io
|
6 |
from docx import Document
|
7 |
+
from nltk.tokenize import sent_tokenize
|
8 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
9 |
from sentence_transformers import SentenceTransformer
|
10 |
from langchain_community.vectorstores import FAISS
|
11 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
|
|
|
|
|
12 |
import gradio as gr
|
13 |
import pickle
|
|
|
14 |
|
15 |
+
# Download NLTK punkt tokenizer if not already downloaded
|
16 |
+
import nltk
|
17 |
nltk.download('punkt')
|
18 |
|
19 |
# Function to extract text from a PDF file
|
|
|
37 |
print(f"Error extracting text from DOCX: {e}")
|
38 |
return text
|
39 |
|
40 |
+
# Initialize Sentence Transformer model for embeddings
|
41 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
42 |
|
43 |
+
# Initialize Hugging Face API token
|
44 |
api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
|
45 |
if not api_token:
|
46 |
raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
|
47 |
|
48 |
+
# Initialize RAG models from Hugging Face
|
49 |
generator_model_name = "facebook/bart-base"
|
50 |
+
retriever_model_name = "facebook/bart-base"
|
51 |
generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
|
52 |
generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
|
53 |
retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
|
|
|
132 |
# Create Gradio interface
|
133 |
iface = gr.Interface(
|
134 |
fn=upload_files,
|
135 |
+
inputs=gr.inputs.File(label="Upload PDF or DOCX file"),
|
136 |
outputs="json",
|
137 |
live=True,
|
138 |
capture_session=True
|