anisrashidov committed on
Commit
84d76e8
·
verified ·
1 Parent(s): aeb7261

Update crawler.py

Files changed (1)
  1. crawler.py  +310 −88
crawler.py CHANGED
@@ -1,98 +1,320 @@
- from bs4 import BeautifulSoup
- import re
- import requests as r
- from html2text import html2text
- import tqdm
-
- def process_url(url):
-     """Process a single URL to fetch answers."""
-     try:
-         response = r.get(url)
-         soup = BeautifulSoup(response.text, "html.parser")
-         # answers = []
-         # for idx in range(1, 100):
-         #     answer = soup.find('div', {'id': f'answer_{idx}'})
-         #     if answer:
-         #         answers.append(answer)
-         #     else:
-         #         break
-         answers = soup.find_all('div', {'id': re.compile(r'answer_\d+')})
-         answers = [html2text(str(answer.find('div', {'class': "answerDetail"}).prettify()))
-                    for answer in answers if answer.find('div', {'class': "answerDetail"})]
-         title = soup.find('div', {'class': 'endTitleSection'}).text.strip()
-         questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()
-         # print("Question: ", questionDetails, '\n')
-         title = title.replace("질문", '').strip()  # drop the "질문" ("question") label from the title
-         print("Answers extracted from: \n", url)
-         print(len(answers))
-         print('-' * 60)
-         return {
-             "title": title,
-             "questionDetails": questionDetails,
-             "url": url,
-             "answers": answers
-         }
-     except Exception as e:
-         print(f"Error processing URL {url}: {e}")
-         with open('error_urls.txt', 'a') as f:  # append mode, so earlier failed URLs are not overwritten
-             f.write(url + '\n')
-         return {"title": '', "questionDetails": '', "url": url, "answers": ''}
-
- def get_answers(results_a_elements, query):
-     """Fetch answers for all the extracted result links."""
-     if not results_a_elements:
-         print("No results found.")
-         return []
-
-     print("Result links extracted: ", len(results_a_elements))
-
-     # Limit the number of parallel processes for better resource management
-     # max_processes = 4
-
-     # with multiprocessing.Pool(processes=max_processes) as pool:
-     #     results = pool.map(process_url, results_a_elements)
-
-     results = []
-     for url in tqdm.tqdm(results_a_elements):
-         results.append(process_url(url))
-     return results
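The commented-out Pool variant above parallelizes the per-URL fetches. Since the work is I/O-bound HTTP, a thread pool is the lighter-weight option; a minimal sketch (not part of the commit), with `max_workers=4` taken from the commented `max_processes` value:

from concurrent.futures import ThreadPoolExecutor

def get_answers_parallel(results_a_elements, max_workers=4):
    # Hypothetical helper: threads suit the I/O-bound requests in
    # process_url better than separate processes would.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(process_url, results_a_elements))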
- def get_search_results(query, num_pages):
-     """Fetch search results for the given query from Naver 지식iN."""
      results = []
-     for page in range(1, num_pages + 1):
-         url = f"https://kin.naver.com/search/list.naver?query={query}&page={page}"
-         print("Starting the scraping process for:\n", url)
-
-         try:
-             response = r.get(url)
-             soup = BeautifulSoup(response.text, "html.parser")
-             results_a_elements = soup.find("ul", {"class": "basic1"}).find_all("a", {"class": "_searchListTitleAnchor"})
-             results_a_elements = [a.get('href') for a in results_a_elements if a.get("href")]
-             results += results_a_elements
-         except Exception as e:
-             print(f"Error while fetching search results: {e}")
      return results
-
- def extract_data(query, num_pages=150) -> list[dict[str, object]]:
-     results_a_elements = get_search_results(query, num_pages)
-     answers = get_answers(results_a_elements, query)
-     print("Total answers collected:", len(answers))
-     return answers
-
- # if __name__ == "__main__":
- #     start = time.time()
- #     # query: "career aspiration, AI developer/researcher, Python, middle-school level, Python installation, book recommendations"
- #     query = "장래희망, 인공지능 개발자/연구원, 파이썬, 중학생 수준, 파이썬 설치, 도서 추천"
- #     answers = extract_data(query)  # was `process_query`, which is not defined in this module
- #     print("Total answers collected:", len(answers))
- #     print("Time taken: ", time.time() - start)
- #     # print(answers)
-
- # AJAX URL:
- # https://kin.naver.com/ajax/detail/answerList.naver?
- #     dirId=401030201&docId=292159869
- #     &answerSortType=DEFAULT&answerViewType=DETAIL
- #     &answerNo=&page=2&count=5&_=1736131792605
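The captured endpoint above suggests the answer list can also be paged as structured data instead of re-scraping HTML. A minimal sketch, assuming the endpoint returns JSON; the parameter names come from the recorded URL, everything else is hypothetical:

import requests

def fetch_answer_page(dir_id, doc_id, page=1, count=5):
    # Hypothetical helper built from the captured AJAX URL above.
    params = {
        "dirId": dir_id,
        "docId": doc_id,
        "answerSortType": "DEFAULT",
        "answerViewType": "DETAIL",
        "page": page,
        "count": count,
    }
    resp = requests.get("https://kin.naver.com/ajax/detail/answerList.naver", params=params)
    resp.raise_for_status()
    return resp.json()  # assumed JSON payload; inspect before relying on its shape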
+ # from fastapi import FastAPI
+ # from fastapi.middleware.cors import CORSMiddleware
+ from openai import OpenAI
+ from google import genai
+ from crawler import extract_data
+ import time
+ import os
+ from dotenv import load_dotenv
+ import gradio as gr
+ # import multiprocessing
+ from together import Together
+ load_dotenv("../.env")
+ # print("Environment variables:", os.environ)
+
+ together_client = Together(
+     api_key=os.getenv("TOGETHER_API_KEY"),
+ )
+
+ gemini_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+ genai_model = "gemini-2.0-flash-exp"
+
+ perplexity_client = OpenAI(api_key=os.getenv("PERPLEXITY_API_KEY"), base_url="https://api.perplexity.ai")
+ gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+ def get_answers(query: str):
+     """Crawl Naver 지식iN search results for the query and return the scraped context."""
+     context = extract_data(query, 1)
+     return context
+
+ # with torch.no_grad():
+ #     model = AutoModel.from_pretrained('BM-K/KoSimCSE-roberta')
+ #     tokenizer = AutoTokenizer.from_pretrained('BM-K/KoSimCSE-roberta', TOKENIZERS_PARALLELISM=True)
+
+ # def cal_score(input_data):
+ #     # Initialize model and tokenizer inside the function
+ #     with torch.no_grad():
+ #         inputs = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt")
+ #         # get_input_embeddings() takes no arguments; it returns the embedding layer,
+ #         # which is then called on the token ids.
+ #         outputs = model.get_input_embeddings()(inputs["input_ids"])
+
+ #     a, b = outputs[0], outputs[1]  # Adjust based on your model's output structure
+
+ #     # Normalize the tensors
+ #     a_norm = a / a.norm(dim=1)[:, None]
+ #     b_norm = b / b.norm(dim=1)[:, None]
+
+ #     print(a.shape, b.shape)
+
+ #     # Return the similarity score
+ #     # return torch.mm(a_norm, b_norm.transpose(0, 1)) * 100
+ #     a_norm = a_norm.reshape(1, -1)
+ #     b_norm = b_norm.reshape(1, -1)
+ #     # (cosine_similarity would come from sklearn.metrics.pairwise)
+ #     similarity_score = cosine_similarity(a_norm, b_norm)
+
+ #     # Return the similarity score (assuming you want the average of the similarities across the tokens)
+ #     return similarity_score  # Scalar value
+
+ # def get_match_scores(message: str, query: str, answers: list[dict[str, object]]):
+ #     start = time.time()
+ #     max_processes = 4
+ #     with multiprocessing.Pool(processes=max_processes) as pool:
+ #         scores = pool.map(cal_score, [[answer['questionDetails'], message] for answer in answers])
+ #     print(f"Time taken to compare: {time.time() - start} seconds")
+ #     print("Scores: ", scores)
+ #     return scores
+
+ def get_naver_answers(message: str):
+     print(">>> Starting naver extraction...")
+     print("Question: ", message)
+     naver_start_time = time.time()
+     # Prompt (Korean): "Summarize the above into a short title. Show only the title.
+     # Do not answer it. Respond only in Korean!!!"
+     response = gemini_client.models.generate_content(
+         model=genai_model,
+         contents=f"{message}\n 위의 내용을 짧은 제목으로 요약합니다. 제목만 보여주세요. 대답하지 마세요. 한국어로만 답변해주세요!!!",
+     )
+     query = response.text
+     print("Query: ", query)
+
+     context = get_answers(query)
+
+     sorted_answers = ['. '.join(answer['answers']) for answer in context]
+     naver_end_time = time.time()
+     print(f"Time taken to extract from Naver: {naver_end_time - naver_start_time} seconds")
+     document = '\n'.join(sorted_answers)
+     return document, naver_end_time - naver_start_time
+
+ def get_qwen_big_answer(message: str):
+     print(">>> Starting Qwen 72B extraction...")
+     qwen_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="Qwen/Qwen2.5-72B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are a helpful question-answer, CONCISE conversation assistant that answers in Korean."},
+             {"role": "user", "content": message}
+         ]
+     )
+     qwen_end_time = time.time()
+     print(f"Time taken to extract from Qwen: {qwen_end_time - qwen_start_time} seconds")
+     return response.choices[0].message.content, qwen_end_time - qwen_start_time
+
+ def get_qwen_small_answer(message: str):
+     print(">>> Starting Qwen 7B extraction...")
+     qwen_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="Qwen/Qwen2.5-7B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are a helpful question-answer, conversation assistant that answers in Korean. Your responses should sound human-like."},
+             {"role": "user", "content": message}
+         ],
+         max_tokens=None
+         # TODO: Change the messages option
+     )
+     qwen_end_time = time.time()
+     print(f"Time taken to extract from Qwen: {qwen_end_time - qwen_start_time} seconds")
+     return response.choices[0].message.content, qwen_end_time - qwen_start_time
+
+ def get_llama_small_answer(message: str):
+     print(">>> Starting Llama 3.1 8B extraction...")
+     llama_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are an artificial intelligence assistant and you need to engage in a helpful, CONCISE, polite question-answer conversation with a user."},
+             {"role": "user", "content": message}
+         ]
+     )
+     llama_end_time = time.time()
+     print(f"Time taken to extract from Llama: {llama_end_time - llama_start_time} seconds")
+     return response.choices[0].message.content, llama_end_time - llama_start_time
+
+ def get_llama_big_answer(message: str):
+     print(">>> Starting Llama 3.1 70B extraction...")
+     llama_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are an artificial intelligence assistant and you need to engage in a helpful, CONCISE, polite question-answer conversation with a user."},
+             {"role": "user", "content": message}
+         ]
+     )
+     llama_end_time = time.time()
+     print(f"Time taken to extract from Llama: {llama_end_time - llama_start_time} seconds")
+     return response.choices[0].message.content, llama_end_time - llama_start_time
+
+ def get_gemini_answer(message: str):
+     print(">>> Starting gemini extraction...")
+     gemini_start_time = time.time()
+     response = gemini_client.models.generate_content(
+         model=genai_model,
+         contents=message,
+     )
+     gemini_end_time = time.time()
+     print(f"Time taken to extract from Gemini: {gemini_end_time - gemini_start_time} seconds")
+     # response.text is the generated string; response.candidates[0].content is a
+     # Content object and would break the downstream string concatenation.
+     return response.text, gemini_end_time - gemini_start_time
+
+ # def get_perplexity_answer(message: str):
+ #     print(">>> Starting perplexity extraction...")
+ #     perplexity_start_time = time.time()
+ #     messages = [
+ #         {
+ #             "role": "system",
+ #             "content": (
+ #                 "You are an artificial intelligence assistant and you need to "
+ #                 "engage in a helpful, CONCISE, polite question-answer conversation with a user."
+ #             ),
+ #         },
+ #         {
+ #             "role": "user",
+ #             "content": message,
+ #         },
+ #     ]
+ #     response = perplexity_client.chat.completions.create(
+ #         model="llama-3.1-sonar-small-128k-online",
+ #         messages=messages
+ #     )
+ #     perplexity_end_time = time.time()
+ #     print(f"Time taken to extract from Perplexity: {perplexity_end_time - perplexity_start_time} seconds")
+ #     return response.choices[0].message.content, perplexity_end_time - perplexity_start_time
+
+ def get_gpt_answer(message: str):
+     print(">>> Starting GPT extraction...")
+     gpt_start_time = time.time()
+     completion = gpt_client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant that gives short answers and nothing extra."},
+             {"role": "user", "content": message}
+         ]
+     )
+     gpt_end_time = time.time()
+     print(f"Time taken to extract from GPT: {gpt_end_time - gpt_start_time} seconds")
+     return completion.choices[0].message.content, gpt_end_time - gpt_start_time
+
+ def compare_answers(message: str):
+     methods = [
+         ("Qwen Big (72B)", get_qwen_big_answer),
+         ("Qwen Small (7B)", get_qwen_small_answer),
+         ("Llama Small (8B)", get_llama_small_answer),
+         ("Llama Big (70B)", get_llama_big_answer),
+         ("Gemini-2.0-Flash", get_gemini_answer),
+         # ("Perplexity", get_perplexity_answer),
+         ("GPT (4o-mini)", get_gpt_answer)
+     ]
+
      results = []
+
+     naver_docs, naver_time_taken = get_naver_answers(message)
+     # Prompt (Korean): "Answer the question based on the document below. Answer only in Korean."
+     content = f'아래 문서를 바탕으로 질문에 답하세요. 답변은 한국어로만 해주세요 \n 질문 {message}\n'
+     content += naver_docs
+     print("Starting the comparison between summarizers...")
+     for method_name, method in methods:
+         answer, time_taken = method(content)
+         results.append({
+             "Method": f"Naver + ({method_name})",
+             "Question": message,
+             "Answer": answer,
+             "Time Taken": naver_time_taken + time_taken
+         })
+
+     print("Starting the comparison between extractors/summarizers...")
+     for method_name, method in methods:
+         additional_docs, time_taken = method(message)
+         results.append({
+             "Method": method_name,
+             "Question": message,
+             "Answer": additional_docs,
+             "Time Taken": time_taken
+         })
+         # Build each method's context from the shared Naver base instead of letting
+         # every previous method's output accumulate into one ever-growing prompt.
+         method_content = content + f'\n{additional_docs}'
+         time_taken += naver_time_taken
+         for summarizer_name, summarizer in methods:
+             answer, answer_time = summarizer(method_content)
+             results.append({
+                 "Method": f"Naver + {method_name} + ({summarizer_name})",
+                 "Question": message,
+                 "Answer": answer,
+                 "Time Taken": time_taken + answer_time
+             })
      return results
+
+ def chatFunction(message, history):
+     # Prompt (Korean): "Answer the question based on the document below. Do not echo the
+     # question in your answer. Answer only in Korean! Never output anything that is not
+     # supported by the retrieved Naver document or the other document. Be friendly and
+     # speak like a human."
+     content = f'아래 문서를 바탕으로 질문에 답하세요. 답변에서 질문을 따라 출력 하지 마세요. 답변은 한국어로만 해주세요! 찾은 Naver 문서와 다른 문서에서 답변이 없는 내용은 절대 출력하지 마세요. 친절하고 인간답게 말하세요. \n 질문: {message}\n 문서: '
+     naver_docs, naver_time_taken = get_naver_answers(message)
+
+     if len(naver_docs) > 55000:
+         # Split the document into roughly equal chunks (with a small overlap) and
+         # summarize each one, so no single request blows past the model's context
+         # window; e.g. 120,000 chars -> 3 chunks of ~40,000 chars each.
+         overlap = 200
+         answers = []
+         num_chunks = (len(naver_docs) - 55000) // 55000 + 2
+         split_len = len(naver_docs) // num_chunks + 1
+         for i in range(num_chunks):
+             if i == 0:
+                 split = naver_docs[:split_len]
+             else:
+                 split = naver_docs[i * split_len - overlap: (i + 1) * split_len]
+             answer, _ = get_qwen_small_answer(f"Summarize important points in a paragraph, given the information below, using only Korean language. Give me only the summary!!! \n {split}")
+             answers.append(answer)
+         naver_docs = '\n'.join(answers)
+
+     start_time = time.time()
+     content += "\n Naver 문서: " + naver_docs
+
+     completion = gpt_client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant that gives detailed answers only in korean."},
+             {"role": "user", "content": message}
+         ]
+     )
+     gpt_resp = completion.choices[0].message.content
+     content += "\n 다른 문서: " + gpt_resp
+
+     # content += "\n" + gpt_resp
+
+     answer, _ = get_qwen_small_answer(content)
+
+     print("-" * 70)
+     print("Question: ", message)
+     print("Answer: ", answer)
+     time_taken = time.time() - start_time
+     print("Time taken to summarize: ", time_taken)
+     return answer
+
+
+ if __name__ == "__main__":
+     # multiprocessing.set_start_method("fork", force=True)
+     # if multiprocessing.get_start_method(allow_none=True) is None:
+     #     multiprocessing.set_start_method("fork")
+     # ChatInterface is a complete Blocks app; build it and launch it directly.
+     demo = gr.ChatInterface(fn=chatFunction, type="messages")
+     demo.launch(share=True)
+     # with open("test_questions.txt", "r") as f:
+     #     if os.path.exists("comparison_results.csv"):
+     #         if input("Do you want to delete the former results? (y/n): ") == "y":
+     #             os.remove("comparison_results.csv")
+     #     questions = f.readlines()
+     #     print(questions)
+     #     for idx, question in enumerate(questions):
+     #         print(" -> Starting the question number: ", idx)
+     #         results = compare_answers(question)
+     #         df = pd.DataFrame(results)
+     #         df.to_csv("comparison_results.csv", mode='a', index=False)
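The commented-out block above sketches a batch run of compare_answers over a question file. A runnable version might look like the following; `test_questions.txt` is assumed to hold one question per line, and the header is written only on the first append because `to_csv(mode='a')` would otherwise repeat it on every write:

import os
import pandas as pd

def run_batch(path="test_questions.txt", out="comparison_results.csv"):
    # Hypothetical driver for the commented-out loop above.
    with open(path, "r") as f:
        questions = [q.strip() for q in f if q.strip()]
    for idx, question in enumerate(questions):
        print(" -> Starting the question number: ", idx)
        df = pd.DataFrame(compare_answers(question))
        # Write the CSV header only once so the appended file stays parseable.
        df.to_csv(out, mode="a", index=False, header=not os.path.exists(out))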