paperstocode / downloader.py
# python 3.8.5
"""
Given an Arxiv url, downloads the Tex files from the e-print URL,
opens the directory that was downloaded, and concatenatnes all the .tex files together
"""
import os
import re
import gzip
import glob
import shutil
import tarfile
import argparse
import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_arxiv(url, output_dir):
    """
    Given an arXiv URL, downloads the TeX source archive from the e-print URL.
    For example, https://arxiv.org/abs/2206.13947 is fetched from
    https://arxiv.org/e-print/2206.13947
    """
    logger.info("Downloading %s", url)
    # Get the arXiv id from the url, e.g. "2206.13947"
    arxiv_id = re.search(r"arxiv\.org/abs/(.*)", url).group(1)
    # Stream the e-print archive to <output_dir>/<arxiv_id>.gz
    filename = os.path.join(output_dir, f"{arxiv_id}.gz")
    with requests.get(f"https://arxiv.org/e-print/{arxiv_id}", stream=True) as r:
        r.raise_for_status()
        with open(filename, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    return filename


def concat_tex_files_in_archive(archive_path, output_dir):
    """
    Given the downloaded archive path, extracts the .tex files and concatenates
    them into a single file. arXiv e-prints are usually gzipped tar archives,
    but can also be a single gzipped .tex file; both cases are handled.
    """
    extract_dir = archive_path[:-3]
    if tarfile.is_tarfile(archive_path):
        # The usual case: a gzipped tar archive, extracted into a directory
        with tarfile.open(archive_path) as tar:
            tar.extractall(extract_dir)
        tex_files = glob.glob(os.path.join(extract_dir, "**", "*.tex"), recursive=True)
    else:
        # Fallback: the e-print is a single gzipped .tex file
        single_tex = extract_dir + ".tex"
        with gzip.open(archive_path, "rb") as f_in, open(single_tex, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        tex_files = [single_tex]
    # Concatenate the tex files into <output_dir>/concat.tex
    with open(os.path.join(output_dir, "concat.tex"), "w") as f:
        for tex_file in tex_files:
            with open(tex_file) as f_in:
                f.write(f_in.read())
    # Clean up the extracted files and the gz archive if desired
    # shutil.rmtree(extract_dir, ignore_errors=True)
    # os.remove(archive_path)
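
# Sketch of programmatic use (the example id comes from the docstrings above;
# "." as the output directory is an arbitrary choice):
#
#   archive = download_arxiv("https://arxiv.org/abs/2206.13947", ".")  # -> ./2206.13947.gz
#   concat_tex_files_in_archive(archive, ".")                          # -> ./concat.tex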


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str, help="arXiv URL, e.g. https://arxiv.org/abs/2206.13947")
    parser.add_argument("--output_dir", type=str, default=".", help="Output directory")
    args = parser.parse_args()
    archive_path = download_arxiv(args.url, args.output_dir)
    concat_tex_files_in_archive(archive_path, args.output_dir)


if __name__ == "__main__":
    main()
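
# Usage (sketch, using the example URL from the module docstring; the directory
# name "paper_src" is an arbitrary choice):
#
#   python downloader.py https://arxiv.org/abs/2206.13947 --output_dir paper_src
#
# This downloads paper_src/2206.13947.gz, extracts the TeX sources, and writes
# the concatenated result to paper_src/concat.tex.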