import pandas as pd
import PyPDF2  # For PDF text extraction
import spacy
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI  # gpt-3.5-turbo is a chat model, so ChatOpenAI rather than the completion-style OpenAI wrapper
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
import gradio as gr
# Extract raw text from a PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # extract_text() can return None on image-only pages
            text += page.extract_text() or ""
    return text
# Extract text from the PDF
pdf_text = extract_text_from_pdf('Getting_Started_with_Ubuntu_16.04.pdf') # Replace with your PDF path
# Convert the text to a DataFrame
df = pd.DataFrame({'text': [pdf_text]})
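
# Note: this stores the entire manual as a single row, so retrieval can only
# ever return the whole document. A common refinement (a sketch, assuming
# LangChain's RecursiveCharacterTextSplitter is available) is to chunk first:
#
#   from langchain.text_splitter import RecursiveCharacterTextSplitter
#   splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#   df = pd.DataFrame({'text': splitter.split_text(pdf_text)})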
# Custom embedding model, wrapped to satisfy LangChain's Embeddings interface
# (embed_documents / embed_query) so the FAISS vector store can call it directly
class CustomEmbeddingModel(Embeddings):
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            # Mean-pool the last hidden state into one vector per text
            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings[0].numpy().tolist()

    def embed_documents(self, texts):
        return [self.embed_text(t) for t in texts]

    def embed_query(self, text):
        return self.embed_text(text)
embedding_model = CustomEmbeddingModel('distilbert-base-uncased') # Replace with your model name
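
# Quick sanity check (hypothetical usage, not part of the app): for
# distilbert-base-uncased the pooled vector should have 768 dimensions.
#
#   vec = embedding_model.embed_query("How do I install software?")
#   assert len(vec) == 768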
# Load spaCy model for preprocessing
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # Raise the default 1,000,000-char limit in case the extracted text is long

def preprocess_text(text):
    doc = nlp(text)
    # Lemmatize, lowercase, and keep only alphabetic, non-stopword tokens
    # (spaCy's built-in stopword list replaces the NLTK stopwords, which were never imported)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)
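
# Illustrative example: preprocess_text("The packages were installed quickly!")
# returns roughly "package install quickly" -- lemmatized, lowercased, with
# stopwords and punctuation dropped.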
# Apply preprocessing and embedding
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)

# Create FAISS vector store from the precomputed (text, embedding) pairs;
# the model is passed along so queries can be embedded at search time
text_embedding_pairs = list(zip(df['text'], df['text_embeddings']))
vector_store = FAISS.from_embeddings(text_embedding_pairs, embedding_model)
# Create LangChain model and chain (any LangChain-compatible chat model works here)
llm_model = ChatOpenAI(model_name='gpt-3.5-turbo')  # Requires OPENAI_API_KEY in the environment
retriever = vector_store.as_retriever()
chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
# Generate a response; ConversationalRetrievalChain expects a "question" plus
# the running "chat_history" and returns its reply under the "answer" key
chat_history = []  # Shared across calls; fine for a single-user demo

def generate_response(prompt):
    result = chain({"question": prompt, "chat_history": chat_history})
    response = result["answer"]
    chat_history.append((prompt, response))
    return response
# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
    outputs=gr.Textbox(label="Response"),
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual."
)

if __name__ == "__main__":
    iface.launch()
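
# To run locally (assuming gradio is installed and OPENAI_API_KEY is set):
#   python app.py
# Gradio serves the app at http://127.0.0.1:7860 by default.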