# Hugging Face Space app: Chromatography Resin Bot — answers questions from PDF files using OpenAI + Gradio.
import os
import fitz  # PyMuPDF
import openai
import gradio as gr

# The OpenAI API key is taken from the environment, never hard-coded.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Folder inside the Space where the PDF files to search are stored.
drive_folder_path = '/home/user/app/Resin'
def extract_text_from_pdf(pdf_path, max_chars=5000):
    """Extract up to *max_chars* characters of text from a PDF.

    Pages are read in order until the cap is reached. Any failure while
    opening or reading the file is logged and whatever text was gathered
    so far (possibly empty) is returned.
    """
    pieces = []
    total_len = 0
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                chunk = page.get_text()  # plain text of this page
                pieces.append(chunk)
                total_len += len(chunk)
                if total_len > max_chars:
                    break  # enough text collected; stop reading pages
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    # Enforce the cap exactly, since the last page may overshoot it.
    return "".join(pieces)[:max_chars]
def preprocess_text(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) to one space."""
    return " ".join(text.split())
def extract_texts_from_drive_folder(folder_path):
    """Extract text from every PDF file directly inside *folder_path*.

    Returns:
        tuple: ``(all_texts, pdf_files)`` where *all_texts* maps filename to
        its extracted text and *pdf_files* lists every PDF filename found
        (even ones whose extraction failed).
    """
    all_texts = {}
    pdf_files = []
    for filename in os.listdir(folder_path):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            pdf_files.append(filename)  # record the file regardless of outcome
            print(f"Processing file: {file_path}")
            try:
                all_texts[filename] = extract_text_from_pdf(file_path)
            except Exception as e:
                # Best-effort: a single bad file must not abort the whole scan.
                print(f"Error processing file {file_path}: {e}")
    return all_texts, pdf_files
def evaluate_relevance(text, question):
    """Ask GPT to rate how relevant *text* is to *question* (1-10).

    Returns:
        tuple: ``(relevance, total_tokens)``. *relevance* is the model's
        reply (ideally just a number) or an error string; *total_tokens*
        is the token count reported by the API, or 0 on failure.
    """
    try:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Please evaluate how relevant the following text is to the question from 1 to 10, return only the number"},
            {"role": "user", "content": "Text: {}, Question: {}".format(text, question)},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=1024,
            temperature=0.7,
        )
        relevance = response.choices[0].message['content'].strip()
        total_tokens = response['usage']['total_tokens']
        return relevance, total_tokens
    except Exception as e:
        # BUGFIX: must return a 2-tuple here too — callers unpack
        # `relevance, tokens = evaluate_relevance(...)`, and the old
        # single-string return made that unpack blow up.
        return f"Error occurred: {str(e)}", 0
def ask_gpt_based_on_single_pdf(text, question):
    """Answer *question* using *text* as the only context, via GPT.

    Returns:
        tuple: ``(answer, total_tokens)``. On failure *answer* is an error
        string and *total_tokens* is 0, so callers can always unpack two values.
    """
    try:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Based on the following content, please answer the question: {}, Question: {}".format(text, question)},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=2048,
            temperature=0.7,
        )
        gpt_response = response.choices[0].message['content'].strip()
        total_tokens = response['usage']['total_tokens']
        return gpt_response, total_tokens
    except Exception as e:
        # BUGFIX: return a 2-tuple on error as well; the previous bare-string
        # return broke callers that unpack `(response, tokens)`.
        return f"Error occurred: {str(e)}", 0
def ask_gpt_based_on_pdfs_from_drive(question, relevance_threshold=3, max_chars=8000):
    """Answer *question* using the PDFs stored in the drive folder.

    Each PDF's text is scored for relevance to the question; texts scoring
    at or above *relevance_threshold* are concatenated (capped at
    *max_chars*) and sent to GPT for the final answer.

    Returns:
        tuple: ``(response_text, total_tokens_used)``;
        *total_tokens_used* is 0 if an unexpected error occurred.
    """
    try:
        pdf_texts, pdf_files = extract_texts_from_drive_folder(drive_folder_path)
        relevant_texts = []
        relevant_files = []
        total_tokens_used = 0  # running total of tokens spent on all API calls
        # Score each PDF's text for relevance to the question.
        for filename, text in pdf_texts.items():
            # FIX: the progress messages interpolate the filename (the
            # previous "(unknown)" placeholders were extraction artifacts).
            print(f"Evaluating relevance of {filename}...")
            text = preprocess_text(text[:max_chars])  # cap length, normalize whitespace
            relevance, tokens_used = evaluate_relevance(text, question)
            total_tokens_used += tokens_used
            print(f"{filename} relevance: {relevance}")
            # Keep this text only if it meets the caller's threshold.
            try:
                if float(relevance) >= relevance_threshold:
                    relevant_texts.append(text)
                    relevant_files.append(filename)
            except ValueError:
                # The model returned something that is not a number.
                print(f"Could not parse relevance score: {relevance}")
        if not relevant_texts:
            return "No PDF files found with sufficient relevance.", total_tokens_used
        combined_text = " ".join(relevant_texts)  # join with spaces, no newlines
        # Ask GPT the final question against the combined relevant text.
        final_response, conclusion_tokens = ask_gpt_based_on_single_pdf(combined_text[:max_chars], question)
        total_tokens_used += conclusion_tokens
        return (
            f"Final GPT response: {final_response}\n"
            f"Relevant PDF files: {len(relevant_files)}\n",
            total_tokens_used
        )
    except Exception as e:
        return f"Error occurred: {str(e)}", 0
def list_pdfs_in_folder():
    """Return a sentence stating how many PDFs live in the drive folder."""
    pdf_count = len(extract_texts_from_drive_folder(drive_folder_path)[1])
    return f"Number of PDF files in the folder: {pdf_count}"
def answer_question_with_gpt(question):
    """Gradio handler: answer *question* from the PDFs and report token usage."""
    gpt_output, tokens_spent = ask_gpt_based_on_pdfs_from_drive(question)
    return f"{gpt_output}Total tokens used: {tokens_spent}"
# Build the Gradio UI: one question textbox in, GPT's answer (with the
# relevant-file count and token usage) out.
interface = gr.Interface(
    title="Chromatography Resin Bot",
    description=list_pdfs_in_folder(),  # PDF count is computed once at startup
    fn=answer_question_with_gpt,
    inputs=[gr.Textbox(label="Enter your question")],
    outputs=gr.Textbox(label="GPT response and relevant files"),
)
interface.launch()