# resinbot/app.py
import os
import fitz  # PyMuPDF
import openai  # legacy SDK (<1.0); provides openai.ChatCompletion
import gradio as gr

# Read the OpenAI API key from the environment (e.g. a Space secret)
openai.api_key = os.getenv('OPENAI_API_KEY')
# Folder where the PDF files are stored (local copy of the Google Drive folder)
drive_folder_path = '/home/user/app/Resin'

def extract_text_from_pdf(pdf_path, max_chars=5000):
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()  # extract the text of each page
                if len(text) > max_chars:
                    break  # stop once the maximum text length is reached
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text[:max_chars]  # cap the returned text length
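
# Hypothetical usage (the path below is a placeholder, not a file known to ship with the app):
#   extract_text_from_pdf("/home/user/app/Resin/example_resin_datasheet.pdf")
#   -> at most the first 5000 characters of the PDF's text, or "" if extraction fails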

def preprocess_text(text):
    clean_text = ' '.join(text.split())  # collapse extra whitespace and line breaks
    return clean_text

def extract_texts_from_drive_folder(folder_path):
    all_texts = {}
    pdf_files = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            pdf_files.append(filename)  # record the file name
            print(f"Processing file: {file_path}")
            try:
                text = extract_text_from_pdf(file_path)
                all_texts[filename] = text
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    return all_texts, pdf_files

def evaluate_relevance(text, question):
    try:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Please evaluate how relevant the following text is to the question from 1 to 10, return only the number"},
            {"role": "user", "content": "Text: {}, Question: {}".format(text, question)}
        ]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=1024,
            temperature=0.7
        )
        relevance = response.choices[0].message['content'].strip()
        total_tokens = response['usage']['total_tokens']
        return relevance, total_tokens
    except Exception as e:
        return f"Error occurred: {str(e)}", 0  # keep the (value, tokens) shape so callers can unpack

def ask_gpt_based_on_single_pdf(text, question):
    try:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Based on the following content, please answer the question: {}, Question: {}".format(text, question)}
        ]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=2048,
            temperature=0.7
        )
        gpt_response = response.choices[0].message['content'].strip()
        total_tokens = response['usage']['total_tokens']
        return gpt_response, total_tokens
    except Exception as e:
        return f"Error occurred: {str(e)}", 0  # keep the (value, tokens) shape so callers can unpack

def ask_gpt_based_on_pdfs_from_drive(question, relevance_threshold=3, max_chars=8000):
    try:
        pdf_texts, pdf_files = extract_texts_from_drive_folder(drive_folder_path)
        relevant_texts = []
        relevant_files = []
        total_tokens_used = 0  # track the total number of tokens used
        # Evaluate how relevant each PDF's text is to the question
        for filename, text in pdf_texts.items():
            print(f"Evaluating relevance of {filename}...")
            text = preprocess_text(text[:max_chars])  # limit the text length and preprocess it
            relevance, tokens_used = evaluate_relevance(text, question)
            total_tokens_used += tokens_used
            print(f"Relevance of {filename}: {relevance}")
            # Keep the text if its score meets the user-defined threshold
            try:
                relevance_score = float(relevance)  # convert to a number
                if relevance_score >= relevance_threshold:
                    relevant_texts.append(text)
                    relevant_files.append(filename)
            except ValueError:
                print(f"Could not parse relevance score: {relevance}")
        if relevant_texts:
            combined_text = " ".join(relevant_texts)  # join with spaces, no line breaks
        else:
            return "No PDF files found with sufficient relevance.", total_tokens_used
        # Ask GPT the question against the combined relevant text
        final_response, conclusion_tokens = ask_gpt_based_on_single_pdf(combined_text[:max_chars], question)
        relevant_file_count = len(relevant_files)
        total_tokens_used += conclusion_tokens
        return (
            f"Final GPT response: {final_response}\n"
            f"Relevant PDF files: {relevant_file_count}",
            total_tokens_used
        )
    except Exception as e:
        return f"Error occurred: {str(e)}", 0

def list_pdfs_in_folder():
    # Count the PDFs directly; extracting their text is not needed just to report a count
    pdf_files = [f for f in os.listdir(drive_folder_path) if f.endswith(".pdf")]
    return f"Number of PDF files in the folder: {len(pdf_files)}"

def answer_question_with_gpt(question):
    response, total_tokens_used = ask_gpt_based_on_pdfs_from_drive(question)
    return f"{response}\nTotal tokens used: {total_tokens_used}"

interface = gr.Interface(
    fn=answer_question_with_gpt,
    inputs=[gr.Textbox(label="Enter your question")],
    outputs=gr.Textbox(label="GPT response and relevant files"),
    title="Chromatography Resin Bot",
    description=list_pdfs_in_folder()
)
interface.launch()
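
# Minimal local smoke test (hypothetical; assumes OPENAI_API_KEY is set and PDFs exist in
# drive_folder_path). For debugging without the UI, one could run the pipeline directly
# instead of interface.launch():
#   answer, tokens = ask_gpt_based_on_pdfs_from_drive("Which resin is suitable for IgG capture?")
#   print(answer, "\nTokens:", tokens)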