Spaces:

rchrdgwr
/

AI4Midterm

Sleeping

AI4Midterm / utilities /get_documents.py

Add files for midterm project

5a9839d 10 months ago

1.13 kB

	import requests
	import os
	from langchain.document_loaders import PyMuPDFLoader

	# Remote locations of the two source PDFs.
	url_1, url_2 = (
	    "https://example.com/Blueprint-for-an-AI-Bill-of-Rights.pdf",
	    "https://example.com/NIST.AI.600-1.pdf",
	)

	# Relative paths where each downloaded PDF is stored on disk.
	file_path_1, file_path_2 = (
	    "data/Blueprint-for-an-AI-Bill-of-Rights.pdf",
	    "data/NIST.AI.600-1.pdf",
	)

	# Function to download a file from a URL
	def download_pdf(url, file_path, timeout=30):
	    """Download ``url`` to ``file_path`` unless the file already exists.

	    Args:
	        url: HTTP(S) address of the file to fetch.
	        file_path: Destination path on the local filesystem. Missing
	            parent directories are created.
	        timeout: Seconds to wait for the server before giving up.

	    Raises:
	        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
	            status. Nothing is written to ``file_path`` in that case.
	        requests.exceptions.RequestException: The request failed for
	            another reason (connection error, timeout, ...).
	    """
	    # Check if the file already exists to avoid re-downloading
	    if os.path.exists(file_path):
	        print(f"{file_path} already exists, skipping download.")
	        return

	    print(f"Downloading {file_path} from {url}...")
	    response = requests.get(url, timeout=timeout)
	    # Fail before touching the disk: otherwise an HTML error page would be
	    # saved as the "PDF" and every later run would skip the download.
	    response.raise_for_status()

	    parent_dir = os.path.dirname(file_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    with open(file_path, 'wb') as f:
	        f.write(response.content)

	# Make sure both PDFs are available locally before they are parsed.
	for _url, _path in ((url_1, file_path_1), (url_2, file_path_2)):
	    download_pdf(_url, _path)

	# Read the first PDF through its PyMuPDFLoader.
	loader_1 = PyMuPDFLoader(file_path_1)
	documents_1 = loader_1.load()

	# Read the second PDF the same way.
	loader_2 = PyMuPDFLoader(file_path_2)
	documents_2 = loader_2.load()