File size: 3,575 Bytes
3d00632
 
 
4c4e926
3d00632
4c4e926
3d00632
4c4e926
 
 
 
3d00632
 
 
 
 
 
 
 
 
 
 
4c4e926
 
3d00632
 
 
 
1842c48
3d00632
 
 
 
 
 
 
 
 
 
3d70771
4c4e926
3d00632
 
7dbc572
3d00632
 
 
 
 
 
 
 
 
 
 
4c4e926
 
 
 
 
 
 
 
 
 
 
 
 
3d00632
1842c48
 
7dbc572
4c4e926
 
 
 
 
 
 
 
 
 
 
1842c48
 
3d00632
1842c48
 
 
3d70771
 
3d00632
3d70771
 
3d00632
67be4ed
3d00632
 
97c8253
 
 
3d70771
3d00632
7dbc572
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import fitz  # PyMuPDF for PDF extraction
import spacy
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
import torch
import gradio as gr
import numpy as np
from faiss import IndexFlatL2, normalize_L2
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

# Extract the raw text of a PDF (no preprocessing happens here)
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, handed unchanged to ``fitz.open``.

    Returns:
        Each page's ``get_text()`` output concatenated in page order, with
        nothing inserted between pages.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Collect the per-page strings and join them in a single pass.
        page_texts = [
            pdf_document.load_page(page_index).get_text()
            for page_index in range(len(pdf_document))
        ]
    return "".join(page_texts)

# Extract text from the PDF (runs once, when the module is loaded)
pdf_path = 'Getting_Started_with_Ubuntu_16.04.pdf'  # Relative path to the PDF file
pdf_text = extract_text_from_pdf(pdf_path)

# Convert the text to a DataFrame: the whole PDF becomes a single row in the
# 'text' column, so everything downstream operates on one document.
df = pd.DataFrame({'text': [pdf_text]})

# Wrapper around a pretrained tokenizer/model pair that turns text into embedding vectors
class CustomEmbeddingModel:
    """Produce mean-pooled embeddings from a pretrained transformer."""

    def __init__(self, model_name):
        """Load the tokenizer and the model registered under *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        """Embed *text* and return the first sequence's vector as a NumPy array.

        The input is tokenized with padding enabled and truncation to at most
        512 tokens. The model runs without gradient tracking, and its last
        hidden state is averaged over dimension 1 to give one vector per
        sequence; only the first of those vectors is returned.
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
        first_vector = pooled[0]
        return first_vector.numpy()

# Shared embedding model instance; the name below is passed to from_pretrained
# for both the tokenizer and the model.
embedding_model = CustomEmbeddingModel('FridayMaster/fine_tune_embedding')  # Replace with your model name

# Load Spacy model for preprocessing (used by preprocess_text below)
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Reduce *text* to a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only when it is purely alphabetic and its lowercased surface form
    is not an NLTK English stopword.

    Args:
        text: Raw text to clean.

    Returns:
        The lowercased lemmas of the surviving tokens, joined by single
        spaces (an empty string when no token survives).
    """
    # Build the stopword lookup once per call. ``stopwords.words`` hands back
    # a list, so evaluating it inside the filter would rebuild that list and
    # scan it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    return ' '.join(
        token.lemma_.lower()
        for token in nlp(text)
        if token.is_alpha and token.text.lower() not in stop_words
    )

# Clean each row's text in place, then embed the cleaned text.
df['text'] = df['text'].apply(preprocess_text)
# The bound method is applied directly; it receives each row's text as its only argument.
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)

# Create FAISS vector store
class SimpleFAISSIndex:
    """Flat L2 index over L2-normalized vectors.

    Stored vectors and queries are both scaled to unit length before they
    reach the index.
    """

    def __init__(self, embeddings):
        """Build the index from a matrix holding one embedding per row.

        Args:
            embeddings: Array-like of shape ``(n_vectors, dim)``; a single
                1-D vector is treated as one row. An array that is already
                C-contiguous float32 is normalized in place.
        """
        embeddings = self._as_faiss_matrix(embeddings)
        self.index = IndexFlatL2(embeddings.shape[1])
        normalize_L2(embeddings)
        self.index.add(embeddings)

    def search(self, query_embedding, k=1):
        """Return the ``k`` nearest stored vectors for the first query row.

        Args:
            query_embedding: Array-like of shape ``(n_queries, dim)`` or a
                single 1-D vector. An array that is already C-contiguous
                float32 is normalized in place.
            k: Number of neighbours to request.

        Returns:
            A ``(indices, distances)`` pair for the first query row only.
        """
        query = self._as_faiss_matrix(query_embedding)
        normalize_L2(query)
        distances, indices = self.index.search(query, k)
        return indices[0], distances[0]

    @staticmethod
    def _as_faiss_matrix(vectors):
        """Coerce *vectors* to a C-contiguous float32 matrix.

        The FAISS helpers used here operate on float32 row-major data, so
        float64 or non-contiguous input is converted (copied) first. Input
        that already conforms is returned without copying.
        """
        matrix = np.ascontiguousarray(vectors, dtype=np.float32)
        if matrix.ndim == 1:
            # Reshaping a contiguous array yields a view, so in-place
            # normalization still reaches the caller's buffer.
            matrix = matrix.reshape(1, -1)
        return matrix

# Stack the per-row embedding vectors into one 2-D array and index them.
embeddings = np.array(df['text_embeddings'].tolist())
vector_store = SimpleFAISSIndex(embeddings)

# Create LangChain model and chain
# LangChain's OpenAI wrapper accepts its settings as keyword arguments only,
# so the model has to be named through ``model_name`` rather than positionally.
llm_model = OpenAI(model_name='gpt-3.5-turbo')  # You can replace this with a different LLM if desired

class SimpleRetriever:
    """Return the stored documents closest to a free-text query.

    NOTE(review): this class only exposes ``retrieve``. LangChain chains
    appear to expect a ``BaseRetriever`` subclass that returns ``Document``
    objects — confirm against the installed LangChain version before
    passing an instance to a chain.
    """

    def __init__(self, vector_store, documents):
        """Remember the search backend and the documents it indexes.

        Args:
            vector_store: Object whose ``search(query_embedding)`` returns an
                ``(indices, distances)`` pair.
            documents: Sequence of documents; position ``i`` must correspond
                to vector ``i`` in ``vector_store``.
        """
        self.vector_store = vector_store
        self.documents = documents

    def retrieve(self, query):
        """Embed *query* and return the matching documents.

        Args:
            query: Query text, embedded with the module-level
                ``embedding_model``.

        Returns:
            The documents at the positions reported by the vector store, in
            the order the store returned them.
        """
        query_embedding = embedding_model.embed_text(query).reshape(1, -1)
        indices, _ = self.vector_store.search(query_embedding)
        # A FAISS-backed store reports "no neighbour found" as -1; skip those
        # so they are not mistaken for Python's "last element" index.
        return [self.documents[idx] for idx in indices if idx >= 0]

# The document list is taken from the same DataFrame column, in the same row
# order, as the text that was embedded, so index positions line up.
retriever = SimpleRetriever(vector_store, df['text'].tolist())
# NOTE(review): SimpleRetriever only defines retrieve(); confirm that
# ConversationalRetrievalChain accepts it as a retriever in the installed
# LangChain version.
chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)

# Function to generate a response
def generate_response(prompt):
    """Answer one user question with the module-level retrieval chain.

    Args:
        prompt: The question text entered by the user.

    Returns:
        The answer produced by the chain.
    """
    # ConversationalRetrievalChain reads "question" and "chat_history" and
    # writes its reply under "answer"; "query"/"result" are the RetrievalQA
    # keys and are rejected by this chain. Every call is treated as the start
    # of a new conversation, hence the empty history.
    result = chain({"question": prompt, "chat_history": []})
    return result["answer"]

# Gradio interface: one text box in, one text box out, backed by generate_response.
iface = gr.Interface(
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual.",
    fn=generate_response,
    inputs=gr.Textbox(placeholder="Ask about Ubuntu...", label="Enter your query"),
    outputs=gr.Textbox(label="Response"),
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()