limitedonly41 committed on
Commit bc11de6 · verified · 1 Parent(s): a445b99

Update app.py

Files changed (1)
  1. app.py +204 -301
app.py CHANGED
@@ -7,7 +7,16 @@ import pandas as pd
 from tqdm import tqdm
 import urllib
 from bs4 import BeautifulSoup
 
 # Configure logging to write messages to a file
 logging.basicConfig(filename='app.log', level=logging.ERROR)
 
@@ -24,82 +33,141 @@ peft_model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
 model = None
 tokenizer = None
 
-def fetch_data(url):
-    headers = {
-        'Accept': '*/*',
-        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
-        'Connection': 'keep-alive',
-        'Referer': f'{url}',
-        'Sec-Fetch-Dest': 'empty',
-        'Sec-Fetch-Mode': 'cors',
-        'Sec-Fetch-Site': 'cross-site',
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
-        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"macOS"',
     }
 
-    encoding = 'utf-8'
-    timeout = 10 # Set your desired timeout value in seconds
-    try:
-        # Make the request using urllib
-        req = urllib.request.Request(url, headers=headers)
-        with urllib.request.urlopen(req, timeout=timeout) as response:
-            response_content = response.read()
-
-        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
-
-        title = soup.find('title').text
-        description = soup.find('meta', attrs={'name': 'description'})
-        description = description.get("content") if description and "content" in description.attrs else ""
-
-        keywords = soup.find('meta', attrs={'name': 'keywords'})
-        keywords = keywords.get("content") if keywords and "content" in keywords.attrs else ""
-
-        h1_all = ". ".join(h.text for h in soup.find_all('h1'))
-        paragraphs_all = ". ".join(p.text for p in soup.find_all('p'))
-        h2_all = ". ".join(h.text for h in soup.find_all('h2'))
-        h3_all = ". ".join(h.text for h in soup.find_all('h3'))
-
-        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"[:4999]
-
-        # Clean up the text
-        h1_all = h1_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-        h2_all = h2_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-        h3_all = h3_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-
-        return {
-            'url': url,
-            'title': title,
-            'description': description,
-            'keywords': keywords,
-            'h1': h1_all,
-            'h2': h2_all,
-            'h3': h3_all,
-            'paragraphs': paragraphs_all,
-            'text': allthecontent
-        }
-    except Exception as e:
-        print(url, e)
-        return {
-            'url': url,
-            'title': None,
-            'description': None,
-            'keywords': None,
-            'h1': None,
-            'h2': None,
-            'h3': None,
-            'paragraphs': None,
-            'text': None
-        }
-
-def main(urls):
     results = []
-    for url in tqdm(urls):
-        result = fetch_data(url)
-        results.append(result)
     return results
 
 
 @spaces.GPU()
 def classify_website(url):
@@ -118,30 +186,79 @@ def classify_website(url):
     )
     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
 
-
     urls = [url]
-    results_shop = main(urls)
 
-    # Convert results to DataFrame
-    df_result_train_more = pd.DataFrame(results_shop)
-    text = df_result_train_more['text'][0]
-    translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
 
     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 
-### Instruction:
-Describe the website text into one word topic:
 
-### Input:
-{}
 
-### Response:
-"""
-
-    prompt = alpaca_prompt.format(translated)
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
 
-    outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
     final_answer = summary.split("### Response:")[1].strip()
     return final_answer
@@ -158,217 +275,3 @@ iface = gr.Interface(
 # Launch the interface
 iface.launch()
 
-
-# import gradio as gr
-# import asyncio
-# import requests
-# from bs4 import BeautifulSoup
-# import pandas as pd
-# from tqdm import tqdm
-# import urllib
-# from deep_translator import GoogleTranslator
-# import spaces
-
-
-# # from unsloth import FastLanguageModel
-# import torch
-# import re
-
-
-
-# # Define helper functions
-# async def fetch_data(url):
-# headers = {
-# 'Accept': '*/*',
-# 'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
-# 'Connection': 'keep-alive',
-# 'Referer': f'{url}',
-# 'Sec-Fetch-Dest': 'empty',
-# 'Sec-Fetch-Mode': 'cors',
-# 'Sec-Fetch-Site': 'cross-site',
-# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
-# 'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
-# 'sec-ch-ua-mobile': '?0',
-# 'sec-ch-ua-platform': '"macOS"',
-# }
-
-# encoding = 'utf-8'
-# timeout = 10
-
-# try:
-# def get_content():
-# req = urllib.request.Request(url, headers=headers)
-# with urllib.request.urlopen(req, timeout=timeout) as response:
-# return response.read()
-
-# response_content = await asyncio.get_event_loop().run_in_executor(None, get_content)
-
-# soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
-
-# title = soup.find('title').text
-# description = soup.find('meta', attrs={'name': 'description'})
-# if description and "content" in description.attrs:
-# description = description.get("content")
-# else:
-# description = ""
-
-# keywords = soup.find('meta', attrs={'name': 'keywords'})
-# if keywords and "content" in keywords.attrs:
-# keywords = keywords.get("content")
-# else:
-# keywords = ""
-
-# h1_all = " ".join(h.text for h in soup.find_all('h1'))
-# h2_all = " ".join(h.text for h in soup.find_all('h2'))
-# h3_all = " ".join(h.text for h in soup.find_all('h3'))
-# paragraphs_all = " ".join(p.text for p in soup.find_all('p'))
-
-# allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
-# allthecontent = allthecontent[:4999]
-
-# return {
-# 'url': url,
-# 'title': title,
-# 'description': description,
-# 'keywords': keywords,
-# 'h1': h1_all,
-# 'h2': h2_all,
-# 'h3': h3_all,
-# 'paragraphs': paragraphs_all,
-# 'text': allthecontent
-# }
-# except Exception as e:
-# return {
-# 'url': url,
-# 'title': None,
-# 'description': None,
-# 'keywords': None,
-# 'h1': None,
-# 'h2': None,
-# 'h3': None,
-# 'paragraphs': None,
-# 'text': None
-# }
-
-# def concatenate_text(data):
-# text_parts = [str(data[col]) for col in ['url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3'] if data[col]]
-# text = ' '.join(text_parts)
-# text = text.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-# text = re.sub(r'\s{2,}', ' ', text)
-# return text
-
-# def translate_text(text):
-# try:
-# text = text[:4990]
-# translated_text = GoogleTranslator(source='auto', target='en').translate(text)
-# return translated_text
-# except Exception as e:
-# print(f"An error occurred during translation: {e}")
-# return None
-
-
-# model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
-
-# # Initialize model and tokenizer variables
-# model = None
-# tokenizer = None
-
-# @spaces.GPU()
-# def summarize_url(url):
-
-# global model, tokenizer # Declare model and tokenizer as global variables
-
-# # Load the model
-# max_seq_length = 2048
-# dtype = None
-# load_in_4bit = True
-
-# if model is None or tokenizer is None:
-# from unsloth import FastLanguageModel
-
-# # Load the model and tokenizer
-# model, tokenizer = FastLanguageModel.from_pretrained(
-# model_name=model_name, # YOUR MODEL YOU USED FOR TRAINING
-# max_seq_length=max_seq_length,
-# dtype=dtype,
-# load_in_4bit=load_in_4bit,
-# )
-# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
-
-
-# result = asyncio.run(fetch_data(url))
-# text = concatenate_text(result)
-# translated_text = translate_text(text)
-# if len(translated_text) < 100:
-# return 'not scraped or short text'
-# alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
-# ### Instruction:
-# Describe the website text into one word topic:
-
-# ### Input:
-# {}
-
-# ### Response:
-# """
-
-# prompt = alpaca_prompt.format(translated_text)
-# inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-
-# outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
-# summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
-# final_answer = summary.split("### Response:")[1].strip()
-# return final_answer
-
-
-# # # Create the Gradio interface within a `Blocks` context, like the working example
-# # with gr.Blocks() as demo:
-
-# # # Add title and description to the interface
-# # gr.HTML("<h1>Website Summary Generator</h1>")
-# # gr.HTML("<p>Enter a URL to get a one-word topic summary of the website content..</p>")
-
-# # # Define input and output elements
-# # with gr.Row():
-# # prompt = gr.Textbox(label="Enter Website URL", placeholder="https://example.com")
-# # output_text = gr.Textbox(label="Topic", interactive=False)
-
-# # # Add the button to trigger the function
-# # submit = gr.Button("Classify")
-
-# # # Define the interaction between inputs and outputs
-# # submit.click(fn=summarize_url, inputs=prompt, outputs=output_text)
-
-# # # Add the `if __name__ == "__main__":` block to launch the interface
-# # if __name__ == "__main__":
-# # demo.launch()
-
-
-# # with gr as demo:
-# # # Define Gradio interface
-# # demo = demo.Interface(
-# # fn=summarize_url,
-# # inputs="text",
-# # outputs="text",
-# # title="Website Summary Generator",
-# # description="Enter a URL to get a one-word topic summary of the website content."
-# # )
-
-
-# # if __name__ == "__main__":
-# # demo.launch()
-
-
-
-# # Create a Gradio interface
-# iface = gr.Interface(
-# fn=summarize_url,
-# inputs="text",
-# outputs="text",
-# title="Website Summary Generator",
-# description="Enter a URL to get a one-word topic summary of the website content."
-# )
-
-# # Launch the interface
-# iface.launch()
-
 
 from tqdm import tqdm
 import urllib
 from bs4 import BeautifulSoup
+import asyncio
 
+from curl_cffi.requests import AsyncSession
+from tqdm.asyncio import tqdm
+from fake_headers import Headers
+
+
+# Limit the number of concurrent workers
+CONCURRENT_WORKERS = 5
+semaphore = asyncio.Semaphore(CONCURRENT_WORKERS)
 # Configure logging to write messages to a file
 logging.basicConfig(filename='app.log', level=logging.ERROR)
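The added imports pair curl_cffi's AsyncSession with an asyncio.Semaphore so that at most CONCURRENT_WORKERS requests are in flight at once. A minimal, self-contained sketch of that concurrency pattern, with a sleep standing in for the real network call (illustrative only, not part of the commit):

import asyncio

CONCURRENT_WORKERS = 5
semaphore = asyncio.Semaphore(CONCURRENT_WORKERS)

async def fetch_one(url: str) -> str:
    # Only CONCURRENT_WORKERS coroutines may pass this point at the same time.
    async with semaphore:
        await asyncio.sleep(0.1)  # stand-in for the real HTTP request
        return f"fetched {url}"

async def fetch_all(urls):
    return await asyncio.gather(*(fetch_one(u) for u in urls))

print(asyncio.run(fetch_all([f"https://example.com/page{i}" for i in range(12)])))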
 
 
 model = None
 tokenizer = None
 
+
+
+
+async def get_page_bs4(url: str, headers):
+
+    wrong_result = {
+        'url': None,
+        'title': None,
+        'description': None,
+        'keywords': None,
+        'h1': None,
+        'h2': None,
+        'h3': None,
+        'paragraphs': None,
+        'text': None,
+        'links': None
     }
 
+    async with semaphore: # Limit concurrency
+        async with AsyncSession() as session:
+
+            wrong_result['url'] = url
+
+            try:
+                response = await session.get(url, headers=headers, impersonate="chrome", timeout=60, verify=False)
+            except:
+                try:
+                    response = await session.get(url, impersonate="chrome", timeout=60, verify=False)
+                except:
+                    return wrong_result
+
+            if response.status_code != 200:
+                return wrong_result
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            try:
+                title = soup.find('title').text if soup.find('title') else ''
+            except:
+                title = ''
+            try:
+                description = soup.find('meta', attrs={'name': 'description'})
+                description = description.get("content") if description else ''
+            except:
+                description = ''
+            try:
+                keywords = soup.find('meta', attrs={'name': 'keywords'})
+                keywords = keywords.get("content") if keywords else ''
+            except:
+                keywords = ''
+            try:
+                h1 = " ".join(h.text for h in soup.find_all('h1'))
+            except:
+                h1 = ''
+            try:
+                h2 = " ".join(h.text for h in soup.find_all('h2'))
+            except:
+                h2 = ''
+            try:
+                h3 = " ".join(h.text for h in soup.find_all('h3'))
+            except:
+                h3 = ''
+            try:
+                paragraphs = " ".join(p.text for p in soup.find_all('p'))
+            except:
+                paragraphs = ''
+            try:
+                menu_tags = []
+                navs = soup.find_all('nav')
+                uls = soup.find_all('ul')
+                ols = soup.find_all('ol')
+                for tag in navs + uls + ols:
+                    menu_tags.extend(tag.find_all('a'))
+                menu_items = [{'text': tag.get_text(strip=True), 'href': tag.get('href')} for tag in menu_tags if tag.get_text(strip=True)]
+                all_menu_texts = ', '.join([item['text'] for item in menu_items])
+            except:
+                all_menu_texts = ''
+
+            # all_content = f"{url} {title} {description} {h1} {h2} {h3} {paragraphs}"[:4999]
+
+            all_content = f" {url} {title} {description} {h1} {h2} {h3} {paragraphs} "[:4999]
+
+            if len(all_content) < 150:
+                all_content = f" {url} {title} {description} {h1} {h2} {h3} {paragraphs} {all_menu_texts}"[:4999]
+
+
+            # all_content = f" {url} {title} {description} {keywords} {h1} {h2} {h3} {paragraphs} "[:4999]
+
+            # all_content = f" url: {url} title: {title} description: {description} keywords: {keywords} h1: {h1} h2: {h2} h3: {h3} p: {paragraphs} links: {all_menu_texts}"[:4999]
+
+
+            result = {
+                'url': url,
+                'title': title,
+                'description': description,
+                'keywords': keywords,
+                'h1': h1,
+                'h2': h2,
+                'h3': h3,
+                'paragraphs': paragraphs,
+                'text': all_content,
+                'links': all_menu_texts
+            }
+
+            return result
+
+
+async def main(urls_list):
+
+    headers_list = [Headers(browser="chrome", os="win").generate() for _ in range(len(urls_list) // 5 + 1)]
+    tasks = []
+
+    # Assign headers to each task, rotating every 5 URLs
+    for i, url in enumerate(urls_list):
+        headers = headers_list[i // 5] # Rotate headers every 5 URLs
+        tasks.append(get_page_bs4(url, headers))
+
+    # Use tqdm to show progress
     results = []
+    for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
+        results.append(await coro)
     return results
 
+def scrape_websites(urls_list):
+
+    try:
+        import nest_asyncio
+        nest_asyncio.apply()
+        loop = asyncio.get_event_loop()
+        result_data = loop.run_until_complete(main(urls_list))
+        # print(len(result_data))
+    except RuntimeError:
+        result_data = asyncio.run(main(urls_list))
+
+    return result_data
+
 
 @spaces.GPU()
 def classify_website(url):
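scrape_websites is the synchronous entry point around the async main: it reuses an already-running event loop via nest_asyncio when one exists and otherwise falls back to asyncio.run. A hedged usage sketch outside the Gradio app (the URL is illustrative; it assumes curl_cffi, fake_headers and nest_asyncio are installed):

if __name__ == "__main__":
    pages = scrape_websites(["https://example.com"])
    for page in pages:
        # Each result is a dict with 'url', 'title', 'description', 'keywords',
        # 'h1', 'h2', 'h3', 'paragraphs', 'text' and 'links' keys.
        print(page["url"], (page["text"] or "")[:100])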
 
     )
     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
 
+
     urls = [url]
+
+    final_ans_dict = {}
+    print('before scrape_websites')
+    result_data = scrape_websites(urls)
+
+    data = result_data[0]
+
+    url = data['url']
+    text = data['text']
+
+    try:
+        if len(text) < 150:
+            # print('Short ', text)
+            prediction = 'Short'
+            final_ans_dict[url] = prediction
+    except:
+        # print(translated)
+        prediction = 'NotScraped'
+        final_ans_dict[url] = prediction
+
+    translated = translate_text(text)
+
+    # print(translated)
+    try:
+        if len(translated) < 150:
+            # print(translated)
+            pred = 'Short'
+            return pred
+    except:
+        # print(translated)
+        pred = 'NotScraped'
+        return pred
+
+
+    example_input = """https://extensionesdepelo.net/ Hair extensions in Valencia ▶ The best prices for natural hair extensions in Valencia Hair Extensions in Valencia ▶ Professional and Natural ⭐ Hair with more volume and length. Perfect Hair Extensions About us Our works Our salon services Hair extensions Hair removal Reviews of satisfied customers Hair palette colors Contacts Fill out the form Over 7 years of experience in hair extensions, we select the color and texture of hair to match your hair so that the hair extensions look natural Gentle and safe hair extensions so that your hair does not suffer. In a few hours, we will transform rare, weak and short hair into luxurious long and healthy hair. We work exclusively with high-quality hair. Thanks to micro and nano capsules, the extensions will be invisible and comfortable. Free consultation before each extension. We use high-quality hair, time-tested
+
+We use small, neat, comfortable
+capsules and make an unnoticeable transition
+We consult
+and answer all
+questions before and after extensions
+Safe extensions without discomfort in wearing. Due to the correct placement of the capsules, the result of the extension is invisible.  A procedure that requires the attention and accuracy of the master. With proper hair removal, the structure of native hair is not damaged We provide a large selection of colors Ask the master a question and we will answer all your questions We work in the hot Italian extension technique. This technique is the most comfortable because it does not require much self-care. We recommend doing a correction every 2-3 months. With the Italian technique, you can do various hairstyles and even make ponytails. To form capsules, we use good refractory keratin.  We work with a proven supplier of natural Slavic hair. We have a large selection of colors, lengths and hair structures."""
+
 
 
     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 
+### Instruction:
+Describe the topic of website from its text :
 
+### ExampleInput:
+{}
 
+### ExampleResponse: The website of the master of hair extensions.
+
+### Input:
+{}
+
+### Response:"""
+
+    prompt = alpaca_prompt.format(example_input,translated)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    with autocast(device_type='cuda'):
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
+        outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
+
+    # inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    # outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
 
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
     final_answer = summary.split("### Response:")[1].strip()
     return final_answer
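The generation step now picks the device at runtime and wraps tokenization and model.generate in torch autocast instead of hard-coding .to("cuda"). A rough stand-alone sketch of the same pattern with a tiny placeholder Hugging Face model (the model id, prompt and token count are illustrative, not the Space's 4-bit Mistral setup):

import torch
from torch.amp import autocast
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "sshleifer/tiny-gpt2"  # placeholder model for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

prompt = "### Instruction:\nDescribe the topic of website from its text:\n\n### Response:"
# Autocast only matters on CUDA; it is disabled on CPU so the sketch runs anywhere.
with autocast(device_type=device.type, enabled=(device.type == "cuda")):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))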
 
 # Launch the interface
 iface.launch()