anisrashidov committed on
Commit 3e01798 · verified · 1 Parent(s): 84d76e8

Update crawler.py

Files changed (1):
  1. crawler.py +91 -310
crawler.py CHANGED
@@ -1,320 +1,101 @@
- # from fastapi import FastAPI
- # from fastapi.middleware.cors import CORSMiddleware
- from openai import OpenAI
- from google import genai
- from crawler import extract_data
- import time
- import os
- from dotenv import load_dotenv
- import gradio as gr
- # import multiprocessing
- from together import Together
-
- load_dotenv("../.env")
- # print("Environment variables:", os.environ)
-
-
- together_client = Together(
-     api_key=os.getenv("TOGETHER_API_KEY"),
- )
-
- gemini_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
- genai_model = "gemini-2.0-flash-exp"
-
- perplexity_client = OpenAI(api_key=os.getenv("PERPLEXITY_API_KEY"), base_url="https://api.perplexity.ai")
- gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-
-
-
- def get_answers( query: str ):
-     context = extract_data(query, 1)
-     return context
-
- # with torch.no_grad():
- #     model = AutoModel.from_pretrained('BM-K/KoSimCSE-roberta')
- #     tokenizer = AutoTokenizer.from_pretrained('BM-K/KoSimCSE-roberta', TOKENIZERS_PARALLELISM=True)
-
- # def cal_score(input_data):
- #     # Initialize model and tokenizer inside the function
- #     with torch.no_grad():
- #         inputs = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt")
- #         outputs = model.get_input_embeddings(inputs["input_ids"])
-
- #     a, b = outputs[0], outputs[1]  # Adjust based on your model's output structure
-
- #     # Normalize the tensors
- #     a_norm = a / a.norm(dim=1)[:, None]
- #     b_norm = b / b.norm(dim=1)[:, None]
-
- #     print(a.shape, b.shape)
-
- #     # Return the similarity score
- #     # return torch.mm(a_norm, b_norm.transpose(0, 1)) * 100
- #     a_norm = a_norm.reshape(1, -1)
- #     b_norm = b_norm.reshape(1, -1)
- #     similarity_score = cosine_similarity(a_norm, b_norm)
-
- #     # Return the similarity score (assuming you want the average of the similarities across the tokens)
- #     return similarity_score  # Scalar value
-
-
-
- # def get_match_scores( message: str, query: str, answers: list[dict[str, object]] ):
- #     start = time.time()
- #     max_processes = 4
- #     with multiprocessing.Pool(processes=max_processes) as pool:
- #         scores = pool.map(cal_score, [[answer['questionDetails'], message] for answer in answers])
- #     print(f"Time taken to compare: {time.time() - start} seconds")
- #     print("Scores: ", scores)
- #     return scores
-
- def get_naver_answers( message: str ):
-     print(">>> Starting naver extraction...")
-     print("Question: ", message)
-     naver_start_time = time.time()
-     response = gemini_client.models.generate_content(
-         model = genai_model,
-         contents=f"{message}\n 위의 내용을 짧은 제목으로 요약합니다. 제목만 보여주세요. 대답하지 마세요. 한국어로만 답변해주세요!!!",
-     )
-     query = response.text
-     print( "Query: ", query)
-
-     context = get_answers( query )
-
-     sorted_answers = ['. '.join(answer['answers']) for answer in context]
-     naver_end_time = time.time()
-     print(f"Time taken to extract from Naver: { naver_end_time - naver_start_time } seconds")
-     document = '\n'.join(sorted_answers)
-     return document, naver_end_time - naver_start_time
-
- def get_qwen_big_answer( message: str ):
-     print(">>> Starting Qwen 72B extraction...")
-     qwen_start_time = time.time()
-     response = together_client.chat.completions.create(
-         model="Qwen/Qwen2.5-72B-Instruct-Turbo",
-         messages=[
-             {"role": "system", "content": "You are a helpful question-answer, CONCISE conversation assistant that answers in Korean."},
-             {"role": "user", "content": message}
-         ]
-     )
-
-     qwen_end_time = time.time()
-     print(f"Time taken to extract from Qwen: { qwen_end_time - qwen_start_time } seconds")
-     return response.choices[0].message.content, qwen_end_time - qwen_start_time
-
- def get_qwen_small_answer( message: str ):
-     print(">>> Starting Qwen 7B extraction...")
-     qwen_start_time = time.time()
-     response = together_client.chat.completions.create(
-         model="Qwen/Qwen2.5-7B-Instruct-Turbo",
-         messages=[
-             {"role": "system", "content": "You are a helpful question-answer, conversation assistant that answers in Korean. Your responses should sound human-like."},
-             {"role": "user", "content": message}
-         ],
-         max_tokens = None
-         #TODO: Change the messages option
-     )
-     qwen_end_time = time.time()
-     print(f"Time taken to extract from Qwen: { qwen_end_time - qwen_start_time } seconds")
-     return response.choices[0].message.content, qwen_end_time - qwen_start_time
-
- def get_llama_small_answer( message: str ):
-     print(">>> Starting Llama 3.1 8B extraction...")
-     llama_start_time = time.time()
-     response = together_client.chat.completions.create(
-         model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-         messages=[
-             {"role": "system", "content": "You are an artificial intelligence assistant and you need to engage in a helpful, CONCISE, polite question-answer conversation with a user."},
-             {
-                 "role": "user",
-                 "content": message
-             }
-         ]
-     )
-     llama_end_time = time.time()
-     print(f"Time taken to extract from Llama: { llama_end_time - llama_start_time } seconds")
-     return response.choices[0].message.content, llama_end_time - llama_start_time
-
- def get_llama_big_answer( message: str ):
-     print(">>> Starting Llama 3.1 70B extraction...")
-     llama_start_time = time.time()
-     response = together_client.chat.completions.create(
-         model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
-         messages=[
-             {"role": "system", "content": "You are an artificial intelligence assistant and you need to engage in a helpful, CONCISE, polite question-answer conversation with a user."},
-             {
-                 "role": "user",
-                 "content": message
-             }
-         ]
-     )
-     llama_end_time = time.time()
-     print(f"Time taken to extract from Llama: { llama_end_time - llama_start_time } seconds")
-     return response.choices[0].message.content, llama_end_time - llama_start_time
-
-
- def get_gemini_answer( message: str ):
-     print(">>> Starting gemini extraction...")
-     gemini_start_time = time.time()
-     response = gemini_client.models.generate_content(
-         model = genai_model,
-         contents=message,
-     )
-     gemini_end_time = time.time()
-     print(f"Time taken to extract from Gemini: { gemini_end_time - gemini_start_time } seconds")
-     return response.candidates[0].content, gemini_end_time - gemini_start_time
-
- # def get_perplexity_answer( message: str ):
- #     print(">>> Starting perplexity extraction...")
- #     perplexity_start_time = time.time()
- #     messages = [
- #         {
- #             "role": "system",
- #             "content": (
- #                 "You are an artificial intelligence assistant and you need to "
- #                 "engage in a helpful, CONCISE, polite question-answer conversation with a user."
- #             ),
- #         },
- #         {
- #             "role": "user",
- #             "content": (
- #                 message
- #             ),
- #         },
- #     ]
- #     response = perplexity_client.chat.completions.create(
- #         model="llama-3.1-sonar-small-128k-online",
- #         messages=messages
- #     )
- #     perplexity_end_time = time.time()
- #     print(f"Time taken to extract from Perplexity: { perplexity_end_time - perplexity_start_time } seconds")
- #     return response.choices[0].message.content, perplexity_end_time - perplexity_start_time
-
- def get_gpt_answer( message: str ):
-     print(">>> Starting GPT extraction...")
-     gpt_start_time = time.time()
-     completion = gpt_client.chat.completions.create(
-         model="gpt-4o-mini",
-         messages=[
-             {"role": "system", "content": "You are a helpful assistant that gives short answers and nothing extra."},
-             {
-                 "role": "user",
-                 "content": message
-             }
-         ]
-     )
-     gpt_end_time = time.time()
-     print(f"Time taken to extract from GPT: { gpt_end_time - gpt_start_time } seconds")
-     return completion.choices[0].message.content, gpt_end_time - gpt_start_time
-
- def compare_answers(message: str):
-     methods = [
-         ("Qwen Big (72B)", get_qwen_big_answer),
-         ("Qwen Small (7B)", get_qwen_small_answer),
-         ("Llama Small (8B)", get_llama_small_answer),
-         ("Llama Big (70B)", get_llama_big_answer),
-         ("Gemini-2.0-Flash", get_gemini_answer),
-         # ("Perplexity", get_perplexity_answer),
-         ("GPT (4o-mini)", get_gpt_answer)
-     ]
 
      results = []
-
-     naver_docs, naver_time_taken = get_naver_answers( message )
-     content = f'아래 문서를 바탕으로 질문에 답하세요. 답변은 한국어로만 해주세요 \n 질문 {message}\n'
-     content += naver_docs
-     print("Starting the comparison between summarizers...")
-     for method_name, method in methods:
-         answer, time_taken = method(content)
-         results.append({
-             "Method": f"Naver + ({method_name})",
-             "Question": message,
-             "Answer": answer,
-             "Time Taken": naver_time_taken + time_taken
-         })
-
-     print("Starting the comparison between extractors/summarizers...")
-     for method_name, method in methods:
-         additional_docs, time_taken = method(message)
-         results.append({
-             "Method": method_name,
-             "Question": message,
-             "Answer": additional_docs,
-             "Time Taken": time_taken
-         })
-         content += f'\n{additional_docs}'
-         time_taken += naver_time_taken
-         for summarizer_name, summarizer in methods:
-             answer, answer_time = summarizer(content)
-             results.append({
-                 "Method": f"Naver + {method_name} + ({summarizer_name})",
-                 "Question": message,
-                 "Answer": answer,
-                 "Time Taken": time_taken + answer_time
-             })
      return results
 
- def chatFunction( message, history ):
-     content = f'아래 문서를 바탕으로 질문에 답하세요. 답변에서 질문을 따라 출력 하지 마세요. 답변은 한국어로만 해주세요! 찾은 Naver 문서와 다른 문서에서 답변이 없는 내용은 절대 출력하지 마세요. 친절하고 인간답게 말하세요. \n 질문: {message}\n 문서: '
-     naver_docs, naver_time_taken = get_naver_answers( message )
-
-     if len(naver_docs) > 55000:
-         overlap = 200
-         answers = []
-         split_len = len(naver_docs) // ( ( len(naver_docs) - 55000 ) // 55000 + 2 ) + 1
-         for i in range( len(naver_docs), split_len ):
-             if i == 0:
-                 split = naver_docs[:split_len]
-             else:
-                 split = naver_docs[i * split_len - overlap: (i + 1) * split_len]
-             answer, _ = get_qwen_small_answer(f"Summarize important points in a paragraph, given the information below, using only Korean language. Give me only the summary!!! \n {split}")
-             answers.append(answer)
-         naver_docs = '\n'.join(answers)
-
-     start_time = time.time()
-     content += "\n Naver 문서: " + naver_docs
-
-     completion = gpt_client.chat.completions.create(
-         model="gpt-4o-mini",
-         messages=[
-             {"role": "system", "content": "You are a helpful assistant that gives detailed answers only in korean."},
-             {
-                 "role": "user",
-                 "content": message
-             }
-         ]
-     )
-     gpt_resp = completion.choices[0].message.content
-     content += "\n 다른 문서: " + gpt_resp
-
-     # content += "\n" + gpt_resp
-
-     answer, _ = get_qwen_small_answer(content)
-
-     print("-"*70)
-     print("Question: ", message)
-     print("Answer: ", answer)
-     time_taken = time.time() - start_time
-     print("Time taken to summarize: ", time_taken)
-     return answer
-
-
- if __name__ == "__main__":
-     # multiprocessing.set_start_method("fork", force=True)
-     # if multiprocessing.get_start_method(allow_none=True) is None:
-     #     multiprocessing.set_start_method("fork")
-     with gr.ChatInterface( fn=chatFunction, type="messages" ) as demo: pass
-     demo.launch(share=True)
-     # with open("test_questions.txt", "r") as f:
-     #     if os.path.exists("comparison_results.csv"):
-     #         if input("Do you want to delete the former results? (y/n): ") == "y":
-     #             os.remove("comparison_results.csv")
-     #     questions = f.readlines()
-     #     print(questions)
-     #     for idx, question in enumerate(questions):
-     #         print(" -> Starting the question number: ", idx)
-     #         results = compare_answers(question)
-     #         df = pd.DataFrame(results)
-     #         df.to_csv("comparison_results.csv", mode='a', index=False)
-
-
+ from bs4 import BeautifulSoup
+ import re
+ import requests as r
+ from html2text import html2text
+ import tqdm
+
+ def process_url(url):
+     """Process a single URL to fetch answers."""
+     try:
+         response = r.get(url)
+         soup = BeautifulSoup(response.text, "html.parser")
+         # answers = []
+         # for idx in range(1, 100):
+         #     answer = soup.find('div', {'id': f'answer_{idx}'})
+         #     if answer:
+         #         answers.append(answer)
+         #     else:
+         #         break
+         answers = soup.find_all('div', {'id': re.compile(r'answer_\d+')})
+         answers = [html2text(str(answer.find('div', {'class': "answerDetail"}).prettify()))
+                    for answer in answers if answer.find('div', {'class': "answerDetail"})]
+         title = soup.find('div', {'class': 'endTitleSection'}).text.strip()
+         questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()
+         # print("Question: ", questionDetails, '\n')
+         title = title.replace("질문", '').strip()
+         print("Answers extracted from: \n", url)
+         print(len(answers))
+         print('-'*60)
+         return {
+             "title": title,
+             "questionDetails": questionDetails,
+             "url": url,
+             "answers": answers
+         }
+     except Exception as e:
+         print(f"Error processing URL {url}: {e}")
+         with open('error_urls.txt', 'a') as f:  # append, so earlier failed URLs are not overwritten
+             f.write(url + '\n')
+         return {"title": '', "questionDetails": '', "url": url, "answers": ''}
+
+ def get_answers(results_a_elements, query):
+     """Fetch answers for all the extracted result links."""
+     if not results_a_elements:
+         print("No results found.")
+         return []
+
+     print("Result links extracted: ", len(results_a_elements))
+
+     # Limit the number of parallel processes for better resource management
+     # max_processes = 4
+
+     # with multiprocessing.Pool(processes=max_processes) as pool:
+     #     results = pool.map(process_url, results_a_elements)
 
      results = []
+     # answer_count = 0
+     for url in tqdm.tqdm(results_a_elements):
+         res = process_url(url)
+         results.append(res)
+         # answer_count += len(res['answers'])
      return results
 
+ def get_search_results(query, num_pages):
+     """Fetch search results for the given query from Naver 지식in."""
+     results = []
+     for page in range(1, num_pages + 1):
+         url = f"https://kin.naver.com/search/list.naver?query={query}&page={page}"
+         print("Starting the scraping process for:\n", url)
+
+         try:
+             response = r.get(url)
+             soup = BeautifulSoup(response.text, "html.parser")
+             results_a_elements = soup.find("ul", {"class": "basic1"}).find_all("a", {"class": "_searchListTitleAnchor"})
+             results_a_elements = [a.get('href') for a in results_a_elements if a.get("href")]
+             results += results_a_elements
+         except Exception as e:
+             print(f"Error while fetching search results: {e}")
+     return results
 
+ def extract_data(query, num_pages=150) -> list[dict[str, object]]:
+     results_a_elements = get_search_results(query, num_pages)
+     answers = get_answers(results_a_elements, query)
+     print("Total answers collected:", len(answers))
+     return answers
 
+ # if __name__ == "__main__":
+ #     start = time.time()
+ #     query = "장래희망, 인공지능 개발자/연구원, 파이썬, 중학생 수준, 파이썬 설치, 도서 추천"
+ #     answers = extract_data(query)
+ #     print("Total answers collected:", len(answers))
+ #     print("Time taken: ", time.time() - start)
+ #     # print(answers)
 
+ # AJAX URL:
+ # https://kin.naver.com/ajax/detail/answerList.naver?
+ #     dirId=401030201&docId=292159869
+ #     &answerSortType=DEFAULT&answerViewType=DETAIL
+ #     &answerNo=&page=2&count=5&_=1736131792605
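
For reference, a minimal usage sketch of the committed module. The query string below is a hypothetical example; extract_data is the entry point the removed app code imported as `from crawler import extract_data`:

# Sketch: crawl one search page and inspect the collected answers.
from crawler import extract_data

results = extract_data("파이썬 설치", num_pages=1)  # hypothetical query, single results page
for item in results:
    print(item["title"], item["url"], len(item["answers"]))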
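The trailing comment documents a paginated answer-list AJAX endpoint. A hedged sketch of calling it with the parameters named in that comment; the dirId/docId values are the example ones above, and the response format is not verified here, so it is printed rather than parsed:

# Sketch: fetch one page of answers through the AJAX endpoint noted above.
# Parameter names and values are copied from the comment; the response shape is an assumption.
import requests

params = {
    "dirId": "401030201",        # category id from the example URL
    "docId": "292159869",        # question document id from the example URL
    "answerSortType": "DEFAULT",
    "answerViewType": "DETAIL",
    "answerNo": "",
    "page": 2,
    "count": 5,
}
resp = requests.get("https://kin.naver.com/ajax/detail/answerList.naver", params=params)
print(resp.status_code)
print(resp.text[:500])           # inspect before assuming JSON or HTML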