import pandas as pd
import PyPDF2  # For PDF text extraction
import spacy
from nltk.corpus import stopwords  # Used in preprocess_text; needs a one-time nltk.download('stopwords')
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI  # gpt-3.5-turbo is a chat model, not a completion model
from langchain.vectorstores import FAISS
import torch
from transformers import AutoTokenizer, AutoModel
import gradio as gr
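
# Assumed prerequisites (not spelled out in the original script): the pip
# packages pypdf2, spacy, nltk, langchain, faiss-cpu, torch, transformers,
# gradio, and openai; the spaCy model via `python -m spacy download en_core_web_sm`;
# and an OPENAI_API_KEY in the environment for the ChatOpenAI call below.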

# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text

# Extract text from the PDF
pdf_text = extract_text_from_pdf('Getting_Started_with_Ubuntu_16.04.pdf')  # Replace with your PDF path

# Convert the text to a DataFrame
df = pd.DataFrame({'text': [pdf_text]})

# Custom embedding model; embed_documents/embed_query match the interface
# LangChain's FAISS store expects from an embeddings object
class CustomEmbeddingModel:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            # Mean-pool the last hidden state into one vector; truncation means
            # only the first 512 tokens contribute
            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings[0].numpy()

    def embed_documents(self, texts):
        return [self.embed_text(t).tolist() for t in texts]

    def embed_query(self, text):
        return self.embed_text(text).tolist()

embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
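
# Quick sanity check (illustrative only; distilbert-base-uncased has hidden size 768):
# vec = embedding_model.embed_query("how do I install software?")
# assert len(vec) == 768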

# Load Spacy model for preprocessing
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # A full manual can exceed spaCy's default 1,000,000-character limit
    nlp.max_length = max(nlp.max_length, len(text) + 1)
    doc = nlp(text)
    stop_words = set(stopwords.words('english'))  # Build the set once, not per token
    tokens = [token.lemma_.lower() for token in doc if token.text.lower() not in stop_words and token.is_alpha]
    return ' '.join(tokens)
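
# Note: the lemmatized, stopword-free text is what gets stored and later handed
# to the LLM as retrieved context; embedding the raw text (or raw chunks)
# instead is a common alternative when readable context matters.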

# Apply preprocessing and embedding
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_query(x))

# Create FAISS vector store from the precomputed (text, embedding) pairs; the
# embeddings object is still passed in so queries can be embedded at search time
text_embedding_pairs = list(zip(df['text'].tolist(), df['text_embeddings'].tolist()))
vector_store = FAISS.from_embeddings(text_embedding_pairs, embedding_model)
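
# The whole manual is stored as a single document above, so retrieval can only
# ever return that one blob, and only its first 512 tokens are embedded. A
# minimal chunking sketch, assuming per-chunk retrieval is wanted instead:
#
#   from langchain.text_splitter import RecursiveCharacterTextSplitter
#   splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#   chunks = splitter.split_text(pdf_text)
#   vector_store = FAISS.from_texts(chunks, embedding_model)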

# Create LangChain model and chain
llm_model = ChatOpenAI(model_name='gpt-3.5-turbo')  # You can swap in a different chat LLM if desired
retriever = vector_store.as_retriever()
chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)

# Generate a response; ConversationalRetrievalChain expects a "question" plus
# the running "chat_history" and returns its reply under "answer"
chat_history = []

def generate_response(prompt):
    result = chain({"question": prompt, "chat_history": chat_history})
    chat_history.append((prompt, result["answer"]))
    return result["answer"]
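
# Example (illustrative): generate_response("How do I install new software?")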

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
    outputs=gr.Textbox(label="Response"),
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual."
)

if __name__ == "__main__":
    iface.launch()
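    # Passing share=True to launch() would also expose a temporary public URL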