import os
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
import pandas as pd
from fpdf import FPDF


def content_crawler(url, file_format='txt', output_file='privacy_policy'):
    # Send an HTTP GET request to the URL (with a timeout so a dead host
    # can't hang the crawler indefinitely)
    response = requests.get(url, timeout=30)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup and collect the
        # headings, paragraphs, italics, and lists
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.find_all(['h2', 'p', 'i', 'ul'])

        # Create the output folder if it doesn't exist
        os.makedirs('../learning_documents', exist_ok=True)
        output_path = os.path.join('../learning_documents', output_file)

        # Save content based on the specified file format
        if file_format == 'txt':
            with open(f"{output_path}.txt", "w", encoding="utf-8") as file:
                for t in text:
                    file.write(f'{t.text}\n')
            print(f"Content saved to {output_path}.txt")
        elif file_format == 'pdf':
            pdf = FPDF()
            pdf.set_auto_page_break(auto=True, margin=15)
            pdf.add_page()
            pdf.set_font("Arial", "B", 8)
            for t in text:
                # The core FPDF fonts only cover latin-1, so replace any
                # unsupported characters; multi_cell wraps long paragraphs
                # where cell() would overflow the page width
                safe_text = t.text.encode('latin-1', 'replace').decode('latin-1')
                pdf.multi_cell(0, 10, safe_text)
            pdf.output(f"{output_path}.pdf")
            print(f"Content saved to {output_path}.pdf")
        elif file_format == 'csv':
            df = pd.DataFrame({'Content': [t.text for t in text]})
            df.to_csv(f"{output_path}.csv", index=False)
            print(f"Content saved to {output_path}.csv")
        elif file_format == 'xml':
            # Escape &, < and > so scraped text can't break the markup
            xml_content = ''.join(f'<item>{escape(t.text)}</item>' for t in text)
            with open(f"{output_path}.xml", "w", encoding="utf-8") as file:
                file.write(f'<root>{xml_content}</root>')
            print(f"Content saved to {output_path}.xml")
        else:
            print("Invalid file format. Supported formats: txt, pdf, csv, xml")
    else:
        print("Failed to retrieve content from the URL.")
if __name__ == '__main__':
    pass
    # Example usage:
    # content_crawler("https://www.presight.io/privacy-policy.html", file_format='pdf', output_file='privacy_policy')
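    # A hedged sketch of the other supported formats, reusing the same
    # example URL as above (any page with h2/p/i/ul content should work):
    # content_crawler("https://www.presight.io/privacy-policy.html", file_format='txt')
    # content_crawler("https://www.presight.io/privacy-policy.html", file_format='csv')
    # content_crawler("https://www.presight.io/privacy-policy.html", file_format='xml')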