LuckRafly's picture
Upload 4 files
df7271b
raw
history blame
4.25 kB
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import UnstructuredURLLoader
from langchain_core.documents.base import Document
from urllib.parse import urlparse
# url = input("Insert Link That You Want to Scrape:")
def scrape_cnn(url):
response = requests.get(url)
# Check if the request was successful (status code 200)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
result = soup.find_all(class_="detail-wrap flex gap-4 relative")
# Clean up and concatenate the text using a for loop
cleaned_text_list = []
for element in result:
cleaned_text = element.get_text().replace('\n', '').strip()
cleaned_text_list.append(cleaned_text)
# Join the cleaned text from the list
all_text = " ".join(cleaned_text_list)
# # Print or use the cleaned and concatenated text
# print(all_text)
# # Write the result to a text file
# with open("result.txt", "w", encoding="utf-8") as f:
# f.write(all_text)
return all_text
else:
print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
def scrape_kompas(url):
response = requests.get(url)
# Check if the request was successful (status code 200)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')
for script in scripts:
script_text = script.get_text()
if "var keywordBrandSafety" in script_text:
result = script_text
result = result.replace ("var keywordBrandSafety =", "").strip().strip('"').strip('";')
return result
else:
print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
def scrape_detik(url):
response = requests.get(url)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
results = soup.find_all(class_='detail__body-text itp_bodycontent')
# Extract and return the text from each element
cleaned_text_list = []
for element in results:
text = element.get_text().replace('\n', '').strip()
cleaned_text_list.append(text)
# Join the cleaned text from the list
all_text = " ".join(cleaned_text_list)
return all_text
else:
print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
def document_instance(link, content):
document_instance = Document(
metadata= {'source':link},
page_content=content
)
return document_instance
def scrape_cnn_instance(url):
content = scrape_cnn(url)
return (document_instance(url, content))
def scrape_kompas_instance(url):
content = scrape_kompas(url)
return (document_instance(url, content))
def scrape_detik_instance(url):
content = scrape_detik(url)
return (document_instance(url, content))
def scraping_pipeline(links:list):
result = []
for link in links:
parsed_url = urlparse(link)
domain = parsed_url.netloc
# filter for detik links
if "detik.com" in domain:
result.append(scrape_detik_instance(link))
# filter for cnn
elif "cnnindonesia.com" in domain:
result.append(scrape_cnn_instance(link))
# filter for kompas
elif "kompas.com" in domain:
result.append(scrape_kompas_instance(link))
else:
print(f"Failed to retrieve the webpage. because your domain was {domain}")
return result
def langchain_url(url):
loader = UnstructuredURLLoader([url])
data = loader.load()
return data
links = [
'https://www.cnnindonesia.com/ekonomi/20231221152333-78-1040259/rupiah-merosot-ke-rp15525-jelang-rilis-data-inflasi-as',
'https://www.cnnindonesia.com/olahraga/20231221131224-142-1040147/mohamed-salah-vs-arsenal-tajam-dan-lebih-sering-menang',
'https://finance.detik.com/infrastruktur/d-7101502/ini-bocoran-konglomerat-yang-bakal-susul-aguan-cs-investasi-di-ikn'
]
if __name__ == "__main__":
print(scraping_pipeline(links =links))