aliceblue11 committed on
Commit
8ec2a9c
·
verified ·
1 Parent(s): 154ebc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -0
app.py CHANGED
@@ -5,43 +5,57 @@ from bs4 import BeautifulSoup
5
# Function to scrape Naver blog title and content
def scrape_naver_blog(url):
    """Fetch a Naver blog post and return its title and body text.

    Args:
        url: Address of the Naver blog post to scrape.

    Returns:
        A string of the form "제목: <title>" + newline + "내용: <content>".
        On failure, an error-message string is returned instead of raising,
        so the Gradio UI shows text rather than a traceback.
    """
    try:
        # Send a GET request to the URL with a browser-like User-Agent so
        # the server serves the normal desktop markup.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # FIX: requests has no default timeout — without one, a stalled
        # connection blocks this call (and the Gradio request) forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        try:
            # Extract title via a position-based CSS path.
            # NOTE(review): this selector is tied to one exact page layout —
            # confirm it still matches the target blog's current markup.
            title_element = soup.select_one(
                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > p > span"
            )
            title = title_element.get_text(strip=True) if title_element else "Title not found"
        except Exception as e:
            title = f"Error extracting title: {e}"

        try:
            # Extract content (same layout-dependence caveat as the title).
            content_element = soup.select_one(
                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
            )
            content = content_element.get_text(strip=True) if content_element else "Content not found"
        except Exception as e:
            content = f"Error extracting content: {e}"

        # Return the results (제목 = title, 내용 = content)
        return f"제목: {title}\n내용: {content}"

    except Exception as e:
        # Network / HTTP / timeout failures land here; report as text so the
        # caller (Gradio) can display the message to the user.
        return f"Error fetching the page: {e}"
41
 
42
# Gradio Interface
def scrape_interface(url):
    """Thin Gradio callback: delegate straight to the scraper."""
    return scrape_naver_blog(url)
46
 
47
  interface = gr.Interface(
@@ -53,4 +67,5 @@ interface = gr.Interface(
53
  )
54
 
55
# Entry point: launch the Gradio app only when run as a script,
# not when imported as a module.
if __name__ == "__main__":
    interface.launch(debug=True)
 
5
# Function to scrape Naver blog title and content
def scrape_naver_blog(url):
    """Fetch a Naver blog post and return its title and body text.

    Emits print() trace lines at each step (the "# Debugging" statements)
    so progress is visible in the app log.

    Args:
        url: Address of the Naver blog post to scrape.

    Returns:
        A string of the form "제목: <title>" + newline + "내용: <content>".
        On failure, an error-message string is returned instead of raising,
        so the Gradio UI shows text rather than a traceback.
    """
    try:
        print("Starting the scraping process...")  # Debugging
        print(f"Target URL: {url}")  # Debugging

        # Send a GET request to the URL with a browser-like User-Agent so
        # the server serves the normal desktop markup.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # FIX: requests has no default timeout — without one, a stalled
        # connection blocks this call (and the Gradio request) forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors
        print("Successfully fetched the page content.")  # Debugging

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        print("HTML parsing completed.")  # Debugging

        try:
            # Extract title via a position-based CSS path.
            # NOTE(review): this selector is tied to one exact page layout —
            # confirm it still matches the target blog's current markup.
            print("Attempting to extract the title...")  # Debugging
            title_element = soup.select_one(
                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > p > span"
            )
            title = title_element.get_text(strip=True) if title_element else "Title not found"
            print(f"Extracted Title: {title}")  # Debugging
        except Exception as e:
            title = f"Error extracting title: {e}"
            print(f"Title extraction error: {e}")  # Debugging

        try:
            # Extract content (same layout-dependence caveat as the title).
            print("Attempting to extract the content...")  # Debugging
            content_element = soup.select_one(
                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
            )
            content = content_element.get_text(strip=True) if content_element else "Content not found"
            print(f"Extracted Content: {content}")  # Debugging
        except Exception as e:
            content = f"Error extracting content: {e}"
            print(f"Content extraction error: {e}")  # Debugging

        # Return the results (제목 = title, 내용 = content)
        return f"제목: {title}\n내용: {content}"

    except Exception as e:
        # Network / HTTP / timeout failures land here; report as text so the
        # caller (Gradio) can display the message to the user.
        print(f"Error fetching the page: {e}")  # Debugging
        return f"Error fetching the page: {e}"
53
 
54
# Gradio Interface
def scrape_interface(url):
    """Gradio callback wrapping scrape_naver_blog, with debug tracing."""
    print("Scrape interface triggered.")  # Debugging
    output = scrape_naver_blog(url)
    print("Scrape process completed.")  # Debugging
    return output
60
 
61
  interface = gr.Interface(
 
67
  )
68
 
69
# Entry point: launch the Gradio app only when run as a script,
# not when imported as a module.
if __name__ == "__main__":
    print("Launching Gradio interface...")  # Debugging
    interface.launch(debug=True)