raannakasturi committed on
Commit
a871aff
·
1 Parent(s): 9183d8e

Update extract_text.py to include User-Agent header in PDF download requests

Browse files
Files changed (1) hide show
  1. extract_text.py +3 -3
extract_text.py CHANGED
@@ -6,10 +6,10 @@ def download_pdf(url, id):
6
  id = id.replace("/", "-")
7
  directory = "downloads"
8
  os.makedirs(directory, exist_ok=True)
9
- file_path = os.path.join(directory, f"{id}.pdf") # Use a unique name based on id
10
  try:
11
- response = requests.get(url)
12
- response.raise_for_status() # Raise an error for bad responses
13
  with open(file_path, "wb") as file:
14
  file.write(response.content)
15
  except Exception as e:
 
6
  id = id.replace("/", "-")
7
  directory = "downloads"
8
  os.makedirs(directory, exist_ok=True)
9
+ file_path = os.path.join(directory, f"{id}.pdf")
10
  try:
11
+ response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"})
12
+ response.raise_for_status()
13
  with open(file_path, "wb") as file:
14
  file.write(response.content)
15
  except Exception as e: