File size: 4,449 Bytes
1649416
24d9947
944d263
24d9947
834c71a
24d9947
 
3ac4e4b
24d9947
 
 
3ac4e4b
24d9947
3ac4e4b
 
 
 
 
 
 
 
 
 
 
24d9947
 
3ac4e4b
24d9947
 
 
 
 
3ac4e4b
24d9947
 
 
 
 
 
 
3ac4e4b
 
24d9947
3ac4e4b
56ec544
24d9947
 
 
944d263
 
 
 
24d9947
944d263
 
24d9947
944d263
834c71a
944d263
 
 
 
 
 
24d9947
944d263
 
24d9947
 
 
7adb197
944d263
90bf4dc
7adb197
ac5f15c
944d263
 
 
 
7adb197
24d9947
7adb197
3ac4e4b
 
 
 
 
 
ac5f15c
944d263
24d9947
944d263
 
 
 
ac5f15c
24d9947
90bf4dc
24d9947
944d263
24d9947
944d263
3ac4e4b
 
 
24d9947
944d263
24d9947
 
 
 
944d263
24d9947
944d263
24d9947
 
 
944d263
24d9947
944d263
24d9947
 
 
944d263
 
24d9947
 
 
 
 
 
 
944d263
3ac4e4b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import pickle
import sys

import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
import torch
from docx import Document
from transformers import AutoModel, AutoTokenizer

# =============================================
# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
# =============================================
# Load the MiniLM encoder directly via transformers; pooling into sentence
# embeddings is done manually in get_embeddings() below. Both calls download
# model weights on first run (network I/O) and cache them locally.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    """Embed one or more texts with the MiniLM encoder.

    Args:
        texts: A single string or a list of strings.

    Returns:
        np.ndarray of shape (len(texts), 384): L2-normalized, mean-pooled
        sentence embeddings. Normalization makes the inner product computed
        by the IndexFlatIP index below equal to cosine similarity.
    """
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True,
                       return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool over real (non-padding) tokens — the pooling this model was
    # trained with — rather than taking the [CLS] token, which MiniLM was not
    # trained to use as a sentence summary.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid divide-by-zero
    embeddings = summed / counts
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()

# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
# The FAISS index and the parallel list of sentence texts are persisted with
# pickle so uploads survive restarts. They MUST stay in lockstep: row i of
# the index corresponds to document_texts[i].
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []

embedding_dim = 384  # Output dimension of all-MiniLM-L6-v2
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
        # Detect a vectors/texts mismatch up front, otherwise search results
        # would silently map to the wrong (or missing) sentences.
        if index.ntotal != len(document_texts):
            raise ValueError(
                f"index has {index.ntotal} vectors but "
                f"{len(document_texts)} texts"
            )
    except Exception as e:
        print(f"Error loading index: {e}")
        # Reset BOTH structures. Resetting only the index (as before) could
        # leave stale texts paired with an empty index when the second pickle
        # load or the consistency check fails.
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)

# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        str: All page text joined together; "" if the file cannot be read
        (the error is printed, not raised).
    """
    text = ""
    try:
        # Use the context manager so the document handle is always closed —
        # the original left the file open, leaking it if a page raised
        # mid-iteration.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(docx_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        docx_path: Filesystem path to the .docx file.

    Returns:
        str: Paragraph texts joined with newlines; "" if the file cannot be
        read (the error is printed, not raised).
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
        return ""

# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    """Ingest uploaded PDF/DOCX files into the FAISS index.

    Extracts text per file, splits it into non-empty lines ("sentences"),
    embeds them, appends them to the global index/text store, and persists
    both to disk.

    Args:
        files: Gradio file objects; anything that is not .pdf/.docx is
            skipped silently.

    Returns:
        str: A summary message, or "Error: ..." if anything raised.
    """
    global index, document_texts
    try:
        # Accumulate across ALL files. The original reported len(sentences)
        # of only the last file, and raised NameError (masked by the broad
        # except) when no file produced any sentences.
        total_sentences = 0
        for file in files:
            file_path = file.name
            if file_path.endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            elif file_path.endswith('.docx'):
                text = extract_text_from_docx(file_path)
            else:
                continue  # unsupported extension

            sentences = [s.strip() for s in text.split("\n") if s.strip()]
            if not sentences:
                continue

            embeddings = get_embeddings(sentences)
            index.add(embeddings)
            document_texts.extend(sentences)
            total_sentences += len(sentences)

        # Persist after every batch so a restart does not lose uploads.
        with open(index_path, "wb") as f:
            pickle.dump(index, f)
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)

        return f"Processed {len(files)} files, added {total_sentences} sentences"
    except Exception as e:
        return f"Error: {str(e)}"

def query_text(query):
    """Return the stored sentences most similar to *query*.

    Args:
        query: Free-text search string.

    Returns:
        str: Up to 3 matching sentences separated by "---" dividers,
        "No matches found" when there are none, or "Query error: ..." if
        anything raised.
    """
    try:
        embedding = get_embeddings(query)
        _, neighbor_ids = index.search(embedding, k=3)

        # FAISS pads with -1 when the index holds fewer than k vectors;
        # the range check drops those sentinels.
        matches = [
            document_texts[i]
            for i in neighbor_ids[0]
            if 0 <= i < len(document_texts)
        ]
        if not matches:
            return "No matches found"
        return "\n\n---\n\n".join(matches)
    except Exception as e:
        return f"Query error: {str(e)}"

# =============================================
# GRADIO INTERFACE
# =============================================
# NOTE: component creation order inside the Blocks context defines the page
# layout, so the statement order here is significant.
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    
    # Tab 1: upload PDF/DOCX files and add them to the FAISS index.
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox()  # status / error message from upload_files
    
    # Tab 2: free-text semantic search over the indexed sentences.
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox()  # matched sentences from query_text
    
    # Wire buttons to the handlers defined above.
    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)

if __name__ == "__main__":
    demo.launch()