import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import UnstructuredURLLoader
from langchain_core.documents.base import Document
from urllib.parse import urlparse

# url = input("Insert Link That You Want to Scrape:")
def scrape_cnn(url):
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # CNN Indonesia wraps the article body in this class combination
        result = soup.find_all(class_="detail-wrap flex gap-4 relative")
        # Clean up and collect the text of each matching element
        cleaned_text_list = []
        for element in result:
            cleaned_text = element.get_text().replace('\n', '').strip()
            cleaned_text_list.append(cleaned_text)
        # Join the cleaned text from the list
        all_text = " ".join(cleaned_text_list)
        return all_text
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
        # Return an empty string so Document creation downstream does not fail on None
        return ""
def scrape_kompas(url):
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Kompas embeds the article text in a JS variable called keywordBrandSafety,
        # so locate the <script> tag that defines it and strip the assignment syntax
        scripts = soup.find_all('script')
        for script in scripts:
            script_text = script.get_text()
            if "var keywordBrandSafety" in script_text:
                result = script_text.replace("var keywordBrandSafety =", "").strip().strip('";')
                return result
        # No matching script tag was found on the page
        return ""
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
        return ""
def scrape_detik(url):
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Detik wraps the article body in this class combination
        results = soup.find_all(class_='detail__body-text itp_bodycontent')
        # Extract and clean the text from each matching element
        cleaned_text_list = []
        for element in results:
            text = element.get_text().replace('\n', '').strip()
            cleaned_text_list.append(text)
        # Join the cleaned text from the list
        all_text = " ".join(cleaned_text_list)
        return all_text
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
        return ""
def document_instance(link, content):
    # Wrap the scraped text in a LangChain Document, keeping the source URL as metadata
    doc = Document(
        metadata={'source': link},
        page_content=content
    )
    return doc
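# e.g. document_instance("https://example.com", "article text") returns roughly
#   Document(page_content='article text', metadata={'source': 'https://example.com'})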
def scrape_cnn_instance(url):
    content = scrape_cnn(url)
    return document_instance(url, content)

def scrape_kompas_instance(url):
    content = scrape_kompas(url)
    return document_instance(url, content)

def scrape_detik_instance(url):
    content = scrape_detik(url)
    return document_instance(url, content)
def scraping_pipeline(links: list):
    result = []
    for link in links:
        parsed_url = urlparse(link)
        domain = parsed_url.netloc
        # Route each link to the scraper that matches its domain
        if "detik.com" in domain:
            result.append(scrape_detik_instance(link))
        elif "cnnindonesia.com" in domain:
            result.append(scrape_cnn_instance(link))
        elif "kompas.com" in domain:
            result.append(scrape_kompas_instance(link))
        else:
            print(f"Skipped {link}: no scraper available for domain {domain}")
    return result
def langchain_url(url):
    # Generic loader that lets the unstructured library parse an arbitrary URL
    loader = UnstructuredURLLoader([url])
    data = loader.load()
    return data
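# langchain_url is defined but not used by scraping_pipeline above. One possible
# extension (a sketch, not part of the original pipeline) is to fall back to this
# generic loader for domains that have no dedicated scraper:
def scraping_pipeline_with_fallback(links: list):
    result = []
    for link in links:
        domain = urlparse(link).netloc
        if "detik.com" in domain:
            result.append(scrape_detik_instance(link))
        elif "cnnindonesia.com" in domain:
            result.append(scrape_cnn_instance(link))
        elif "kompas.com" in domain:
            result.append(scrape_kompas_instance(link))
        else:
            # UnstructuredURLLoader.load() already returns a list of Documents
            result.extend(langchain_url(link))
    return result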
links = [
    'https://www.cnnindonesia.com/ekonomi/20231221152333-78-1040259/rupiah-merosot-ke-rp15525-jelang-rilis-data-inflasi-as',
    'https://www.cnnindonesia.com/olahraga/20231221131224-142-1040147/mohamed-salah-vs-arsenal-tajam-dan-lebih-sering-menang',
    'https://finance.detik.com/infrastruktur/d-7101502/ini-bocoran-konglomerat-yang-bakal-susul-aguan-cs-investasi-di-ikn'
]

if __name__ == "__main__":
    print(scraping_pipeline(links=links))
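# The pipeline returns LangChain Document objects, so its output can feed directly
# into standard LangChain tooling. A minimal downstream sketch (assumes the
# langchain package is installed; chunk sizes are illustrative, not prescribed):
#
#     from langchain.text_splitter import RecursiveCharacterTextSplitter
#
#     docs = scraping_pipeline(links=links)
#     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#     chunks = splitter.split_documents(docs)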