import gradio as gr
import pdfplumber
import together
import re
import unicodedata

# Set up Together.AI API key (replace the placeholder with your actual key)
together.api_key = "YOUR_TOGETHER_API_KEY"

def clean_text(text):
    """Cleans extracted text for better processing by the model."""
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)  # Keep basic punctuation
    text = re.sub(r'(?i)(page\s*\d+)', '', text)  # Remove page numbers
    return text

def extract_text_from_pdf(pdf_file):
    """Extract and clean text from the uploaded PDF."""
    print("extracting")
    try:
        with pdfplumber.open(pdf_file) as pdf:
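            # Skip pages with no extractable text; the walrus operator binds each page's raw text before cleaning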
            text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return None

def split_text(text, chunk_size=500):
    """Splits text into smaller chunks for better processing."""
    print("splitting")
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def chatbot(pdf_file, user_question):
    """Processes the PDF and answers the user's question."""
    print("chatbot start")
    
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_file)
    if not text:
        return "Could not extract any text from the PDF."
    
    # Split into smaller chunks
    chunks = split_text(text)

    # Use only the first chunk (to optimize token usage)
    prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"

    # Send to Together.AI (Mistral-7B)
    response = together.Completion.create(
        model="mistralai/Mistral-7B-Instruct-v0.1",
        prompt=prompt,
        max_tokens=200,
        temperature=0.7,
    )

    # Return chatbot's response
    return response["choices"][0]["text"]

# Gradio Interface
iface = gr.Interface(
    fn=chatbot,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Ask a Question")],
    outputs=gr.Textbox(label="Answer"),
    title="PDF Q&A Chatbot (Powered by Together.AI)"
)

# Launch Gradio app
iface.launch()
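
# Usage (sketch, assuming gradio, pdfplumber, and together are installed locally):
#   pip install gradio pdfplumber together
#   python app.py
# Then open the local URL Gradio prints, upload a PDF, and ask a question about it.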