paperstocode / downloader.py
# python 3.8.5
"""
Given an Arxiv url, downloads the Tex files from the e-print URL,
opens the directory that was downloaded, and concatenatnes all the .tex files together
"""
import os
import re
import gzip
import glob
import shutil
import tarfile
import argparse
import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_arxiv(url, output_dir):
    """
    Given an arXiv URL, downloads the TeX source archive from the e-print URL.
    For example, https://arxiv.org/abs/2206.13947 is fetched from
    https://arxiv.org/e-print/2206.13947
    """
    logger.info("Downloading %s", url)
    # Get the arXiv id from the url, e.g. "2206.13947"
    arxiv_id = re.search(r"arxiv\.org/abs/(.*)", url).group(1)
    # Stream the e-print archive to <output_dir>/<arxiv_id>.gz
    filename = os.path.join(output_dir, f"{arxiv_id}.gz")
    with requests.get(f"https://arxiv.org/e-print/{arxiv_id}", stream=True) as r:
        r.raise_for_status()
        with open(filename, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    return filename


def concat_tex_files_in_archive(archive_path, output_dir):
    """
    Given the downloaded archive path, extracts the .tex files and concatenates
    them into a single file. arXiv e-prints are usually gzipped tar archives,
    but can also be a single gzipped .tex file; both cases are handled.
    """
    extract_dir = archive_path[:-3]
    if tarfile.is_tarfile(archive_path):
        # The usual case: a gzipped tar archive, extracted into a directory
        with tarfile.open(archive_path) as tar:
            tar.extractall(extract_dir)
        tex_files = glob.glob(os.path.join(extract_dir, "**", "*.tex"), recursive=True)
    else:
        # Fallback: the e-print is a single gzipped .tex file
        single_tex = extract_dir + ".tex"
        with gzip.open(archive_path, "rb") as f_in, open(single_tex, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        tex_files = [single_tex]
    # Concatenate the tex files into <output_dir>/concat.tex
    with open(os.path.join(output_dir, "concat.tex"), "w") as f:
        for tex_file in tex_files:
            with open(tex_file) as f_in:
                f.write(f_in.read())
    # Clean up the extracted files and the gz archive if desired
    # shutil.rmtree(extract_dir, ignore_errors=True)
    # os.remove(archive_path)
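
# Sketch of programmatic use (the example id comes from the docstrings above;
# "." as the output directory is an arbitrary choice):
#
#   archive = download_arxiv("https://arxiv.org/abs/2206.13947", ".")  # -> ./2206.13947.gz
#   concat_tex_files_in_archive(archive, ".")                          # -> ./concat.tex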


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str, help="arXiv URL, e.g. https://arxiv.org/abs/2206.13947")
    parser.add_argument("--output_dir", type=str, default=".", help="Output directory")
    args = parser.parse_args()
    archive_path = download_arxiv(args.url, args.output_dir)
    concat_tex_files_in_archive(archive_path, args.output_dir)


if __name__ == "__main__":
    main()
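
# Usage (sketch, using the example URL from the module docstring; the directory
# name "paper_src" is an arbitrary choice):
#
#   python downloader.py https://arxiv.org/abs/2206.13947 --output_dir paper_src
#
# This downloads paper_src/2206.13947.gz, extracts the TeX sources, and writes
# the concatenated result to paper_src/concat.tex.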