Spaces:
Running
Running
File size: 2,344 Bytes
fe36699 cf10f44 61157d2 92757b3 fe36699 61157d2 06fdfd7 fe36699 cf10f44 20218cb cf10f44 fe36699 cf10f44 20218cb cf10f44 fe36699 cf10f44 61157d2 20218cb 61157d2 fe36699 cf10f44 20218cb cf10f44 61157d2 cf10f44 61157d2 cf10f44 61157d2 cf10f44 fe36699 61157d2 cf10f44 fe36699 61157d2 cf10f44 fe36699 61157d2 fe36699 cf10f44 61157d2 fe36699 cf10f44 5f6bee3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import os
import re
import unicodedata

import gradio as gr
import pdfplumber
import together
# Together.AI API key: read from the TOGETHER_API_KEY environment variable so
# the secret is never committed to source control. A key that has appeared in
# a published file must be treated as leaked and rotated.
# If the variable is unset this assigns None and API calls will fail to
# authenticate until a key is configured.
together.api_key = os.environ.get("TOGETHER_API_KEY")
def clean_text(text):
    """Clean extracted PDF text into a compact string for the model.

    Steps, in order:
      1. Apply Unicode NFKC normalization (e.g. ligatures become plain
         letters, non-breaking spaces become regular spaces).
      2. Replace every character other than ASCII letters, digits and the
         punctuation ``. , ! ? ; : ' " ( ) -`` with a space.
      3. Remove "page N" markers, case-insensitively. The marker must start
         at a word boundary so words such as "homepage 3" are left intact.
      4. Collapse every run of whitespace into a single space and trim both
         ends.

    Whitespace is collapsed last so that the gaps left behind by steps 2
    and 3 do not survive as double spaces or leading/trailing blanks.

    Args:
        text: Raw text, typically the output of one PDF page extraction.

    Returns:
        The cleaned string; empty if nothing printable remains.
    """
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)
    # Anything outside the allowed set (including newlines/tabs) becomes a space.
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)
    # Drop page-number markers such as "Page 3" or "page12".
    text = re.sub(r'\bpage\s*\d+', '', text, flags=re.IGNORECASE)
    # Final pass: squeeze whitespace created by the substitutions above.
    return re.sub(r'\s+', ' ', text).strip()
def extract_text_from_pdf(pdf_file):
    """Read a PDF with pdfplumber and return its cleaned text.

    Each page's extracted text is passed through ``clean_text``; pages that
    yield no text are skipped. The cleaned page texts are joined with a
    single space.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts for the uploaded file.

    Returns:
        The joined text (an empty string when no page produced text), or
        ``None`` if any exception was raised while opening or reading the
        PDF. The exception is printed, not re-raised.
    """
    print("extracting")
    try:
        cleaned_pages = []
        with pdfplumber.open(pdf_file) as document:
            for page in document.pages:
                raw = page.extract_text()
                if raw:
                    cleaned_pages.append(clean_text(raw))
        return " ".join(cleaned_pages)
    except Exception as e:
        print(f"Error extracting text: {e}")
        return None
def split_text(text, chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece (default 500).

    Returns:
        A list of substrings in original order; the last one may be shorter.
        An empty ``text`` gives an empty list.
    """
    print("splitting")
    offsets = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in offsets]
def chatbot(pdf_file, user_question):
    """Answer a question about an uploaded PDF using a Together.AI model.

    The question is validated first, then the PDF text is extracted and
    split into chunks. Only the first chunk is placed in the prompt to keep
    token usage low, so answers are based on the beginning of the document.

    Args:
        pdf_file: The uploaded PDF as supplied by the Gradio file input.
        user_question: The question typed by the user.

    Returns:
        The model's completion text, or a human-readable message when the
        question is blank, no text could be extracted, or the model request
        failed. Failures are reported as strings rather than raised so the
        UI always shows something meaningful.
    """
    print("chatbot start")

    # Reject blank questions before doing any PDF or network work.
    if not user_question or not user_question.strip():
        return "Please enter a question."
    question = user_question.strip()

    # Extract text from the PDF (None/empty/whitespace-only all mean "nothing usable").
    text = extract_text_from_pdf(pdf_file)
    if not text or not text.strip():
        return "Could not extract any text from the PDF."

    # Split into smaller chunks
    chunks = split_text(text)

    # Use only the first chunk (to optimize token usage)
    prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {question}"

    # Send to Together.AI (Mistral-7B). This is the UI boundary, so any
    # failure (network, auth, unexpected response shape) is logged and turned
    # into a message instead of crashing the request.
    try:
        response = together.Completion.create(
            model="mistralai/Mistral-7B-Instruct-v0.1",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7,
        )
        return response["choices"][0]["text"]
    except Exception as e:
        print(f"Error querying Together.AI: {e}")
        return "Could not get an answer from the model. Please try again later."
# Gradio Interface: a PDF upload and a question box in, one answer box out.
pdf_input = gr.File(label="Upload PDF")
question_input = gr.Textbox(label="Ask a Question")
answer_output = gr.Textbox(label="Answer")

iface = gr.Interface(
    fn=chatbot,
    inputs=[pdf_input, question_input],
    outputs=answer_output,
    title="PDF Q&A Chatbot (Powered by Together.AI)",
)

# Launch Gradio app
iface.launch()
|