Spaces:
Sleeping
Sleeping
bluemiracle0214
committed on
Commit
•
37d744f
1
Parent(s):
6f2b7d4
Update app.py
Browse files
app.py
CHANGED
@@ -15,44 +15,34 @@ def extract_text_from_pdf(pdf_path, max_chars=5000):
|
|
15 |
try:
|
16 |
with fitz.open(pdf_path) as doc:
|
17 |
for page in doc:
|
18 |
-
text += page.get_text()
|
19 |
if len(text) > max_chars:
|
20 |
-
break
|
21 |
except Exception as e:
|
22 |
print(f"Error extracting text from {pdf_path}: {e}")
|
23 |
-
return text[:max_chars]
|
24 |
|
25 |
def preprocess_text(text):
|
26 |
-
clean_text = ' '.join(text.split())
|
27 |
return clean_text
|
28 |
-
|
29 |
def extract_texts_from_drive_folder(folder_path):
|
30 |
all_texts = {}
|
31 |
pdf_files = []
|
32 |
-
|
33 |
-
print(folder_path)
|
34 |
-
|
35 |
-
if not os.path.exists(folder_path):
|
36 |
-
return all_texts, pdf_files # 폴더가 없으면 빈 목록 반환
|
37 |
-
|
38 |
for filename in os.listdir(folder_path):
|
39 |
if filename.endswith(".pdf"):
|
40 |
file_path = os.path.join(folder_path, filename)
|
41 |
-
pdf_files.append(filename)
|
42 |
print(f"Processing file: {file_path}")
|
43 |
try:
|
44 |
text = extract_text_from_pdf(file_path)
|
45 |
all_texts[filename] = text
|
46 |
except Exception as e:
|
47 |
print(f"Error processing file {file_path}: {e}")
|
48 |
-
|
49 |
-
if not pdf_files:
|
50 |
-
print("No PDF files found in the specified folder.")
|
51 |
return all_texts, pdf_files
|
52 |
|
53 |
def evaluate_relevance(text, question):
|
54 |
try:
|
55 |
-
# Request GPT to evaluate the relevance of the text
|
56 |
messages = [
|
57 |
{"role": "system", "content": "You are a helpful assistant."},
|
58 |
{"role": "user", "content": "Please evaluate how relevant the following text is to the question from 1 to 10, return only the number"},
|
@@ -62,12 +52,13 @@ def evaluate_relevance(text, question):
|
|
62 |
response = openai.ChatCompletion.create(
|
63 |
model="gpt-3.5-turbo",
|
64 |
messages=messages,
|
65 |
-
max_tokens=
|
66 |
temperature=0.7
|
67 |
)
|
68 |
|
69 |
relevance = response.choices[0].message['content'].strip()
|
70 |
-
|
|
|
71 |
|
72 |
except Exception as e:
|
73 |
return f"Error occurred: {str(e)}"
|
@@ -82,12 +73,13 @@ def ask_gpt_based_on_single_pdf(text, question):
|
|
82 |
response = openai.ChatCompletion.create(
|
83 |
model="gpt-3.5-turbo",
|
84 |
messages=messages,
|
85 |
-
max_tokens=
|
86 |
temperature=0.7
|
87 |
)
|
88 |
|
89 |
gpt_response = response.choices[0].message['content'].strip()
|
90 |
-
|
|
|
91 |
|
92 |
except Exception as e:
|
93 |
return f"Error occurred: {str(e)}"
|
@@ -98,40 +90,48 @@ def ask_gpt_based_on_pdfs_from_drive(question, relevance_threshold=4, max_chars=
|
|
98 |
|
99 |
relevant_texts = []
|
100 |
relevant_files = []
|
|
|
101 |
|
|
|
102 |
for filename, text in pdf_texts.items():
|
103 |
print(f"Evaluating relevance of {filename}...")
|
104 |
-
text = preprocess_text(text[:max_chars])
|
105 |
-
relevance = evaluate_relevance(text, question)
|
106 |
-
|
|
|
107 |
|
|
|
108 |
try:
|
109 |
-
relevance_score = float(relevance)
|
110 |
if relevance_score >= relevance_threshold:
|
111 |
relevant_texts.append(text)
|
112 |
relevant_files.append(filename)
|
113 |
except ValueError:
|
114 |
-
print(f"
|
115 |
|
116 |
if relevant_texts:
|
117 |
combined_text = " ".join(relevant_texts) # 줄바꿈 없이 공백으로 연결
|
118 |
else:
|
119 |
-
return "No PDF files found with sufficient relevance."
|
|
|
|
|
|
|
120 |
|
121 |
-
final_response = ask_gpt_based_on_single_pdf(combined_text[:max_chars], question)
|
122 |
relevant_file_count = len(relevant_files)
|
|
|
123 |
|
124 |
-
return "Final GPT response: " + final_response + "\n" + f"Relevant PDF files: {relevant_file_count}" + "\n"
|
125 |
|
126 |
except Exception as e:
|
127 |
-
return f"Error occurred: {str(e)}"
|
128 |
|
129 |
def list_pdfs_in_folder():
|
130 |
_, pdf_files = extract_texts_from_drive_folder(drive_folder_path)
|
131 |
return f"Number of PDF files in the folder: {len(pdf_files)}"
|
132 |
|
133 |
def answer_question_with_gpt(question):
|
134 |
-
|
|
|
135 |
|
136 |
interface = gr.Interface(
|
137 |
fn=answer_question_with_gpt,
|
|
|
15 |
try:
|
16 |
with fitz.open(pdf_path) as doc:
|
17 |
for page in doc:
|
18 |
+
text += page.get_text() # 페이지에서 텍스트 추출
|
19 |
if len(text) > max_chars:
|
20 |
+
break # 최대 텍스트 길이를 넘으면 중지
|
21 |
except Exception as e:
|
22 |
print(f"Error extracting text from {pdf_path}: {e}")
|
23 |
+
return text[:max_chars] # 텍스트 길이 제한
|
24 |
|
25 |
def preprocess_text(text):
|
26 |
+
clean_text = ' '.join(text.split()) # 불필요한 공백과 줄바꿈을 제거
|
27 |
return clean_text
|
28 |
+
|
29 |
def extract_texts_from_drive_folder(folder_path):
|
30 |
all_texts = {}
|
31 |
pdf_files = []
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
for filename in os.listdir(folder_path):
|
33 |
if filename.endswith(".pdf"):
|
34 |
file_path = os.path.join(folder_path, filename)
|
35 |
+
pdf_files.append(filename) # 파일 이름을 목록에 추가
|
36 |
print(f"Processing file: {file_path}")
|
37 |
try:
|
38 |
text = extract_text_from_pdf(file_path)
|
39 |
all_texts[filename] = text
|
40 |
except Exception as e:
|
41 |
print(f"Error processing file {file_path}: {e}")
|
|
|
|
|
|
|
42 |
return all_texts, pdf_files
|
43 |
|
44 |
def evaluate_relevance(text, question):
|
45 |
try:
|
|
|
46 |
messages = [
|
47 |
{"role": "system", "content": "You are a helpful assistant."},
|
48 |
{"role": "user", "content": "Please evaluate how relevant the following text is to the question from 1 to 10, return only the number"},
|
|
|
52 |
response = openai.ChatCompletion.create(
|
53 |
model="gpt-3.5-turbo",
|
54 |
messages=messages,
|
55 |
+
max_tokens=1024,
|
56 |
temperature=0.7
|
57 |
)
|
58 |
|
59 |
relevance = response.choices[0].message['content'].strip()
|
60 |
+
total_tokens = response['usage']['total_tokens']
|
61 |
+
return relevance, total_tokens
|
62 |
|
63 |
except Exception as e:
|
64 |
return f"Error occurred: {str(e)}"
|
|
|
73 |
response = openai.ChatCompletion.create(
|
74 |
model="gpt-3.5-turbo",
|
75 |
messages=messages,
|
76 |
+
max_tokens=2048,
|
77 |
temperature=0.7
|
78 |
)
|
79 |
|
80 |
gpt_response = response.choices[0].message['content'].strip()
|
81 |
+
total_tokens = response['usage']['total_tokens']
|
82 |
+
return gpt_response, total_tokens
|
83 |
|
84 |
except Exception as e:
|
85 |
return f"Error occurred: {str(e)}"
|
|
|
90 |
|
91 |
relevant_texts = []
|
92 |
relevant_files = []
|
93 |
+
total_tokens_used = 0 # 총 사용된 토큰 수를 추적
|
94 |
|
95 |
+
# 각 PDF 파일의 텍스트 관련성 평가
|
96 |
for filename, text in pdf_texts.items():
|
97 |
print(f"Evaluating relevance of {filename}...")
|
98 |
+
text = preprocess_text(text[:max_chars]) # 텍스트 길이 제한 및 전처리 적용
|
99 |
+
relevance, tokens_used = evaluate_relevance(text, question)
|
100 |
+
total_tokens_used += tokens_used
|
101 |
+
print(f"{filename} 관련성: {relevance}")
|
102 |
|
103 |
+
# 사용자가 설정한 임계치보다 높으면 추가
|
104 |
try:
|
105 |
+
relevance_score = float(relevance) # 숫자로 변환
|
106 |
if relevance_score >= relevance_threshold:
|
107 |
relevant_texts.append(text)
|
108 |
relevant_files.append(filename)
|
109 |
except ValueError:
|
110 |
+
print(f"Relevance 평가에 오류가 있음: {relevance}")
|
111 |
|
112 |
if relevant_texts:
|
113 |
combined_text = " ".join(relevant_texts) # 줄바꿈 없이 공백으로 연결
|
114 |
else:
|
115 |
+
return "No PDF files found with sufficient relevance.", total_tokens_used
|
116 |
+
|
117 |
+
# GPT์๊ฒ ์ต์ข
๊ฒฐํฉ๋ ํ
์คํธ๋ก ์ง๋ฌธ
|
118 |
+
final_response, conclusion_tokens = ask_gpt_based_on_single_pdf(combined_text[:max_chars], question)
|
119 |
|
|
|
120 |
relevant_file_count = len(relevant_files)
|
121 |
+
total_tokens_used += conclusion_tokens
|
122 |
|
123 |
+
return "Final GPT response: " + final_response + "\n" + f"Relevant PDF files: {relevant_file_count}" + "\n" + total_tokens_used
|
124 |
|
125 |
except Exception as e:
|
126 |
+
return f"Error occurred: {str(e)}", 0
|
127 |
|
128 |
def list_pdfs_in_folder():
|
129 |
_, pdf_files = extract_texts_from_drive_folder(drive_folder_path)
|
130 |
return f"Number of PDF files in the folder: {len(pdf_files)}"
|
131 |
|
132 |
def answer_question_with_gpt(question):
|
133 |
+
response, total_tokens_used = ask_gpt_based_on_pdfs_from_drive(question)
|
134 |
+
return f"{response}Total tokens used: {total_tokens_used}"
|
135 |
|
136 |
interface = gr.Interface(
|
137 |
fn=answer_question_with_gpt,
|