"""Gradio app that answers questions from a folder of PDF files.

Each PDF's text is rated for relevance to the question by the OpenAI chat
API; the texts that meet the threshold are concatenated and used as the
context for a final answer.
"""
import os

import fitz  # PyMuPDF
import gradio as gr
import openai

# The OpenAI API key is read from the environment.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Local folder where the PDF files are stored.
drive_folder_path = '/home/user/app/Resin'


def extract_text_from_pdf(pdf_path, max_chars=5000):
    """Return at most ``max_chars`` characters of text from a PDF.

    Pages are read in order and reading stops once more than ``max_chars``
    characters have been collected. Extraction errors are printed, and
    whatever text was collected before the failure is returned (possibly
    an empty string); this function does not raise on a bad PDF.
    """
    pieces = []
    collected = 0
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_text = page.get_text()
                pieces.append(page_text)
                collected += len(page_text)
                if collected > max_chars:
                    break  # enough text gathered; skip the remaining pages
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return "".join(pieces)[:max_chars]


def preprocess_text(text):
    """Collapse every run of whitespace (including newlines) to one space."""
    return ' '.join(text.split())


def _list_pdf_filenames(folder_path):
    """Return the names of the entries in ``folder_path`` ending in ".pdf"."""
    return [name for name in os.listdir(folder_path) if name.endswith(".pdf")]


def extract_texts_from_drive_folder(folder_path):
    """Extract text from every ``.pdf`` file directly inside ``folder_path``.

    Returns:
        A ``(all_texts, pdf_files)`` tuple: ``all_texts`` maps each filename
        to its extracted text (empty if extraction failed), and ``pdf_files``
        is the list of PDF filenames found.
    """
    pdf_files = _list_pdf_filenames(folder_path)
    all_texts = {}
    for filename in pdf_files:
        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")
        # extract_text_from_pdf reports its own errors and never raises,
        # so no additional try/except is needed here.
        all_texts[filename] = extract_text_from_pdf(file_path)
    return all_texts, pdf_files


def _chat_completion(messages, max_tokens):
    """Send ``messages`` to gpt-3.5-turbo; return ``(reply_text, total_tokens)``."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=max_tokens,
        temperature=0.7
    )
    reply = response.choices[0].message['content'].strip()
    return reply, response['usage']['total_tokens']


def evaluate_relevance(text, question):
    """Ask the model to rate, from 1 to 10, how relevant ``text`` is to ``question``.

    Returns:
        A ``(relevance, total_tokens)`` tuple, where ``relevance`` is the
        model's raw reply. On failure the tuple is ``(error_message, 0)``,
        so callers can always unpack two values.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Please evaluate how relevant the following text is to the question from 1 to 10, return only the number"},
        {"role": "user", "content": "Text: {}, Question: {}".format(text, question)}
    ]
    try:
        return _chat_completion(messages, max_tokens=1024)
    except Exception as e:
        # Keep the two-element shape: a bare string here would make the
        # caller's tuple unpacking fail and hide the real error.
        return f"Error occurred: {str(e)}", 0


def ask_gpt_based_on_single_pdf(text, question):
    """Ask the model to answer ``question`` using ``text`` as the context.

    Returns:
        A ``(answer, total_tokens)`` tuple; on failure
        ``(error_message, 0)``.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Based on the following content, please answer the question: {}, Question: {}".format(text, question)}
    ]
    try:
        return _chat_completion(messages, max_tokens=2048)
    except Exception as e:
        # Same two-element shape as the success path (see evaluate_relevance).
        return f"Error occurred: {str(e)}", 0


def ask_gpt_based_on_pdfs_from_drive(question, relevance_threshold=4, max_chars=10000):
    """Answer ``question`` from the PDFs in ``drive_folder_path``.

    Every PDF's text is rated for relevance; texts scoring at least
    ``relevance_threshold`` are joined with spaces, truncated to
    ``max_chars`` characters and passed to the model as context.

    Returns:
        A ``(message, total_tokens_used)`` tuple. ``message`` is the
        formatted answer, a notice that no PDF was relevant enough, or an
        error description (in which case the token count is 0).
    """
    try:
        pdf_texts, _ = extract_texts_from_drive_folder(drive_folder_path)
        relevant_texts = []
        relevant_files = []
        total_tokens_used = 0  # running total of tokens spent on API calls

        # Rate the relevance of each PDF's text.
        for filename, text in pdf_texts.items():
            print(f"Evaluating relevance of {filename}...")
            text = preprocess_text(text[:max_chars])  # cap length, then normalize whitespace
            if not text:
                # Nothing was extracted from this PDF, so there is nothing
                # to rate; avoid spending an API call on empty input.
                print(f"Skipping {filename}: no extractable text")
                continue

            relevance, tokens_used = evaluate_relevance(text, question)
            total_tokens_used += tokens_used
            print(f"{filename} 관련성: {relevance}")

            # Keep the text when its score meets the threshold. A reply that
            # is not a number (including an API error message) is reported
            # and the file is skipped.
            try:
                relevance_score = float(relevance)
            except ValueError:
                print(f"Relevance 평가에 오류가 있음: {relevance}")
                continue
            if relevance_score >= relevance_threshold:
                relevant_texts.append(text)
                relevant_files.append(filename)

        if not relevant_texts:
            return "No PDF files found with sufficient relevance.", total_tokens_used

        combined_text = " ".join(relevant_texts)  # joined with spaces, no newlines

        # Ask the final question against the combined relevant text.
        final_response, conclusion_tokens = ask_gpt_based_on_single_pdf(
            combined_text[:max_chars], question
        )
        total_tokens_used += conclusion_tokens
        relevant_file_count = len(relevant_files)

        return (
            f"Final GPT response: {final_response}\n"
            f"Relevant PDF files: {relevant_file_count}\n",
            total_tokens_used
        )
    except Exception as e:
        return f"Error occurred: {str(e)}", 0


def list_pdfs_in_folder():
    """Return a sentence stating how many PDF files ``drive_folder_path`` holds."""
    # Only the count is needed, so list the filenames without opening and
    # extracting every PDF.
    pdf_files = _list_pdf_filenames(drive_folder_path)
    return f"Number of PDF files in the folder: {len(pdf_files)}"


def answer_question_with_gpt(question):
    """Gradio handler: answer ``question`` and append the token usage."""
    response, total_tokens_used = ask_gpt_based_on_pdfs_from_drive(question)
    # The success message already ends with a newline; the error and
    # "no relevant PDF" messages do not, so add one to keep the token
    # summary on its own line.
    separator = "" if response.endswith("\n") else "\n"
    return f"{response}{separator}Total tokens used: {total_tokens_used}"


interface = gr.Interface(
    fn=answer_question_with_gpt,
    inputs=[gr.Textbox(label="Enter your question")],
    outputs=gr.Textbox(label="GPT response and relevant files"),
    title="Chromatography Resin Bot",
    description=list_pdfs_in_folder()
)

interface.launch()