import pandas as pd
import fitz  # PyMuPDF, for PDF text extraction
import spacy
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
import torch
import gradio as gr
import numpy as np
from faiss import IndexFlatL2, normalize_L2
from langchain.chat_models import ChatOpenAI  # gpt-3.5-turbo is a chat model, so use the chat wrapper
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import BaseRetriever, Document
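
# The NLTK stopword list must be available locally; fetch it once if missing.
nltk.download('stopwords', quiet=True)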
# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
    return text
# Extract text from the PDF
pdf_path = 'Getting_Started_with_Ubuntu_16.04.pdf'  # The PDF file is expected in the same directory
pdf_text = extract_text_from_pdf(pdf_path)
# Convert the text to a DataFrame
df = pd.DataFrame({'text': [pdf_text]})
# Load the custom embedding model
class CustomEmbeddingModel:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        # Mean-pool the last hidden state into one vector; input is truncated
        # to the model's 512-token limit, so very long texts lose content
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings[0].numpy()

embedding_model = CustomEmbeddingModel('FridayMaster/fine_tune_embedding')  # Replace with your model name
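
# Quick sanity check (illustrative only, not part of the app): embed_text
# should return a 1-D vector of the model's hidden size, e.g.
#   vec = embedding_model.embed_text("hello world")
#   assert vec.ndim == 1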
# Load spaCy model for preprocessing
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))  # build the set once rather than once per token

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if token.is_alpha and token.text.lower() not in stop_words]
    return ' '.join(tokens)
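
# spaCy caps input at nlp.max_length (1,000,000 chars by default), and a full
# manual can exceed that. Raising the cap to cover this document assumes the
# text fits comfortably in memory, which holds for a single PDF.
nlp.max_length = max(nlp.max_length, len(pdf_text) + 1)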
# Apply preprocessing and embedding
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
# Create FAISS vector store
class SimpleFAISSIndex:
    def __init__(self, embeddings):
        self.index = IndexFlatL2(embeddings.shape[1])
        normalize_L2(embeddings)  # in-place L2 normalization, so L2 distance ranks like cosine similarity
        self.index.add(embeddings)

    def search(self, query_embedding, k=1):
        normalize_L2(query_embedding)
        distances, indices = self.index.search(query_embedding, k)
        return indices[0], distances[0]

embeddings = np.array(df['text_embeddings'].tolist(), dtype=np.float32)  # FAISS requires float32
vector_store = SimpleFAISSIndex(embeddings)
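
# Design note: the entire manual is indexed as a single document, so retrieval
# always returns that one row. Splitting the text into passage-sized chunks
# before embedding would make top-k search meaningful.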
# Create LangChain model and chain (requires the OPENAI_API_KEY environment variable)
llm_model = ChatOpenAI(model_name='gpt-3.5-turbo')  # You can swap in a different LLM here if desired

# Wrap the FAISS index in a LangChain retriever. The BaseRetriever interface
# has shifted between LangChain releases; this follows the classic (pre-0.1)
# API in which get_relevant_documents(query) returns a list of Documents.
class SimpleRetriever(BaseRetriever):
    def __init__(self, vector_store, documents):
        super().__init__()
        self.vector_store = vector_store
        self.documents = documents

    def get_relevant_documents(self, query):
        query_embedding = embedding_model.embed_text(query).reshape(1, -1)
        indices, _ = self.vector_store.search(query_embedding)
        return [Document(page_content=self.documents[idx]) for idx in indices]

    async def aget_relevant_documents(self, query):
        return self.get_relevant_documents(query)

retriever = SimpleRetriever(vector_store, df['text'].tolist())
chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
# Function to generate a response
def generate_response(prompt):
    # ConversationalRetrievalChain expects "question" and "chat_history" inputs
    # and returns its output under the "answer" key. Each call starts with an
    # empty history; persist the list across calls to keep conversational context.
    result = chain({"question": prompt, "chat_history": []})
    return result["answer"]
# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
    outputs=gr.Textbox(label="Response"),
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual."
)

if __name__ == "__main__":
    iface.launch()
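
# Rough dependency sketch (package names only; versions are assumptions, since
# this Space does not pin them here):
#   pip install pandas pymupdf spacy nltk transformers torch gradio numpy faiss-cpu langchain openai
#   python -m spacy download en_core_web_sm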