bluemiracle0214 committed on
Commit
37d744f
•
1 Parent(s): 6f2b7d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -30
app.py CHANGED
@@ -15,44 +15,34 @@ def extract_text_from_pdf(pdf_path, max_chars=5000):
15
  try:
16
  with fitz.open(pdf_path) as doc:
17
  for page in doc:
18
- text += page.get_text()
19
  if len(text) > max_chars:
20
- break
21
  except Exception as e:
22
  print(f"Error extracting text from {pdf_path}: {e}")
23
- return text[:max_chars]
24
 
25
  def preprocess_text(text):
26
- clean_text = ' '.join(text.split())
27
  return clean_text
28
-
29
  def extract_texts_from_drive_folder(folder_path):
30
  all_texts = {}
31
  pdf_files = []
32
-
33
- print(folder_path)
34
-
35
- if not os.path.exists(folder_path):
36
- return all_texts, pdf_files # 폴더가 없으면 빈 목록 반환
37
-
38
  for filename in os.listdir(folder_path):
39
  if filename.endswith(".pdf"):
40
  file_path = os.path.join(folder_path, filename)
41
- pdf_files.append(filename)
42
  print(f"Processing file: {file_path}")
43
  try:
44
  text = extract_text_from_pdf(file_path)
45
  all_texts[filename] = text
46
  except Exception as e:
47
  print(f"Error processing file {file_path}: {e}")
48
-
49
- if not pdf_files:
50
- print("No PDF files found in the specified folder.")
51
  return all_texts, pdf_files
52
 
53
  def evaluate_relevance(text, question):
54
  try:
55
- # Request GPT to evaluate the relevance of the text
56
  messages = [
57
  {"role": "system", "content": "You are a helpful assistant."},
58
  {"role": "user", "content": "Please evaluate how relevant the following text is to the question from 1 to 10, return only the number"},
@@ -62,12 +52,13 @@ def evaluate_relevance(text, question):
62
  response = openai.ChatCompletion.create(
63
  model="gpt-3.5-turbo",
64
  messages=messages,
65
- max_tokens=200,
66
  temperature=0.7
67
  )
68
 
69
  relevance = response.choices[0].message['content'].strip()
70
- return relevance
 
71
 
72
  except Exception as e:
73
  return f"Error occurred: {str(e)}"
@@ -82,12 +73,13 @@ def ask_gpt_based_on_single_pdf(text, question):
82
  response = openai.ChatCompletion.create(
83
  model="gpt-3.5-turbo",
84
  messages=messages,
85
- max_tokens=1000,
86
  temperature=0.7
87
  )
88
 
89
  gpt_response = response.choices[0].message['content'].strip()
90
- return gpt_response
 
91
 
92
  except Exception as e:
93
  return f"Error occurred: {str(e)}"
@@ -98,40 +90,48 @@ def ask_gpt_based_on_pdfs_from_drive(question, relevance_threshold=4, max_chars=
98
 
99
  relevant_texts = []
100
  relevant_files = []
 
101
 
 
102
  for filename, text in pdf_texts.items():
103
  print(f"Evaluating relevance of {filename}...")
104
- text = preprocess_text(text[:max_chars])
105
- relevance = evaluate_relevance(text, question)
106
- print(f"{filename} relevance: {relevance}")
 
107
 
 
108
  try:
109
- relevance_score = float(relevance)
110
  if relevance_score >= relevance_threshold:
111
  relevant_texts.append(text)
112
  relevant_files.append(filename)
113
  except ValueError:
114
- print(f"Error in relevance evaluation: {relevance}")
115
 
116
  if relevant_texts:
117
  combined_text = " ".join(relevant_texts) # 줄바꿈 없이 공백으로 연결
118
  else:
119
- return "No PDF files found with sufficient relevance."
 
 
 
120
 
121
- final_response = ask_gpt_based_on_single_pdf(combined_text[:max_chars], question)
122
  relevant_file_count = len(relevant_files)
 
123
 
124
- return "Final GPT response: " + final_response + "\n" + f"Relevant PDF files: {relevant_file_count}" + "\n"
125
 
126
  except Exception as e:
127
- return f"Error occurred: {str(e)}"
128
 
129
  def list_pdfs_in_folder():
130
  _, pdf_files = extract_texts_from_drive_folder(drive_folder_path)
131
  return f"Number of PDF files in the folder: {len(pdf_files)}"
132
 
133
  def answer_question_with_gpt(question):
134
- return ask_gpt_based_on_pdfs_from_drive(question)
 
135
 
136
  interface = gr.Interface(
137
  fn=answer_question_with_gpt,
 
15
  try:
16
  with fitz.open(pdf_path) as doc:
17
  for page in doc:
18
+ text += page.get_text() # 페이지에서 텍스트 추출
19
  if len(text) > max_chars:
20
+ break # 최대 텍스트 길이를 넘으면 중지
21
  except Exception as e:
22
  print(f"Error extracting text from {pdf_path}: {e}")
23
+ return text[:max_chars] # 텍스트 길이 제한
24
 
25
  def preprocess_text(text):
26
+ clean_text = ' '.join(text.split()) # 불필요한 공백과 줄바꿈을 제거
27
  return clean_text
28
+
29
  def extract_texts_from_drive_folder(folder_path):
30
  all_texts = {}
31
  pdf_files = []
 
 
 
 
 
 
32
  for filename in os.listdir(folder_path):
33
  if filename.endswith(".pdf"):
34
  file_path = os.path.join(folder_path, filename)
35
+ pdf_files.append(filename) # 파일 이름을 목록에 추가
36
  print(f"Processing file: {file_path}")
37
  try:
38
  text = extract_text_from_pdf(file_path)
39
  all_texts[filename] = text
40
  except Exception as e:
41
  print(f"Error processing file {file_path}: {e}")
 
 
 
42
  return all_texts, pdf_files
43
 
44
  def evaluate_relevance(text, question):
45
  try:
 
46
  messages = [
47
  {"role": "system", "content": "You are a helpful assistant."},
48
  {"role": "user", "content": "Please evaluate how relevant the following text is to the question from 1 to 10, return only the number"},
 
52
  response = openai.ChatCompletion.create(
53
  model="gpt-3.5-turbo",
54
  messages=messages,
55
+ max_tokens=1024,
56
  temperature=0.7
57
  )
58
 
59
  relevance = response.choices[0].message['content'].strip()
60
+ total_tokens = response['usage']['total_tokens']
61
+ return relevance, total_tokens
62
 
63
  except Exception as e:
64
  return f"Error occurred: {str(e)}"
 
73
  response = openai.ChatCompletion.create(
74
  model="gpt-3.5-turbo",
75
  messages=messages,
76
+ max_tokens=2048,
77
  temperature=0.7
78
  )
79
 
80
  gpt_response = response.choices[0].message['content'].strip()
81
+ total_tokens = response['usage']['total_tokens']
82
+ return gpt_response, total_tokens
83
 
84
  except Exception as e:
85
  return f"Error occurred: {str(e)}"
 
90
 
91
  relevant_texts = []
92
  relevant_files = []
93
+ total_tokens_used = 0 # 총 사용된 토큰 수를 추적
94
 
95
+ # ๊ฐ PDF ํŒŒ์ผ์˜ ํ…์ŠคํŠธ ๊ด€๋ จ์„ฑ ํ‰๊ฐ€
96
  for filename, text in pdf_texts.items():
97
  print(f"Evaluating relevance of {filename}...")
98
+ text = preprocess_text(text[:max_chars]) # 텍스트 길이 제한 및 전처리 적용
99
+ relevance, tokens_used = evaluate_relevance(text, question)
100
+ total_tokens_used += tokens_used
101
+ print(f"{filename} 관련성: {relevance}")
102
 
103
+ # 사용자가 설정한 임계치보다 높으면 추가
104
  try:
105
+ relevance_score = float(relevance) # 숫자로 변환
106
  if relevance_score >= relevance_threshold:
107
  relevant_texts.append(text)
108
  relevant_files.append(filename)
109
  except ValueError:
110
+ print(f"Relevance 평가에 오류가 있음: {relevance}")
111
 
112
  if relevant_texts:
113
  combined_text = " ".join(relevant_texts) # 줄바꿈 없이 공백으로 연결
114
  else:
115
+ return "No PDF files found with sufficient relevance.", total_tokens_used
116
+
117
+ # GPT์—๊ฒŒ ์ตœ์ข… ๊ฒฐํ•ฉ๋œ ํ…์ŠคํŠธ๋กœ ์งˆ๋ฌธ
118
+ final_response, conclusion_tokens = ask_gpt_based_on_single_pdf(combined_text[:max_chars], question)
119
 
 
120
  relevant_file_count = len(relevant_files)
121
+ total_tokens_used += conclusion_tokens
122
 
123
+ return "Final GPT response: " + final_response + "\n" + f"Relevant PDF files: {relevant_file_count}" + "\n" + total_tokens_used
124
 
125
  except Exception as e:
126
+ return f"Error occurred: {str(e)}", 0
127
 
128
  def list_pdfs_in_folder():
129
  _, pdf_files = extract_texts_from_drive_folder(drive_folder_path)
130
  return f"Number of PDF files in the folder: {len(pdf_files)}"
131
 
132
  def answer_question_with_gpt(question):
133
+ response, total_tokens_used = ask_gpt_based_on_pdfs_from_drive(question)
134
+ return f"{response}Total tokens used: {total_tokens_used}"
135
 
136
  interface = gr.Interface(
137
  fn=answer_question_with_gpt,