# python 3.8.5
"""
Given an arXiv URL, downloads the source archive from the e-print endpoint,
extracts it, and concatenates all the .tex files together.
"""
import argparse
import glob
import gzip
import logging
import os
import re
import shutil
import tarfile

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_arxiv(url, output_dir):
    """
    Given an arXiv URL, downloads the source archive from the e-print endpoint.
    For example, the url https://arxiv.org/abs/2206.13947
    """
    logger.info("Downloading %s", url)
    # Get the arXiv id from the url
    match = re.search(r"arxiv\.org/abs/(.*)", url)
    if match is None:
        raise ValueError(f"Could not parse an arXiv id from {url}")
    arxiv_id = match.group(1)
    # Download the gz archive
    filename = os.path.join(output_dir, f"{arxiv_id}.gz")
    with requests.get(f"https://arxiv.org/e-print/{arxiv_id}", stream=True) as r:
        r.raise_for_status()
        with open(filename, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    return filename


def concat_tex_files_in_archive(archive_path, output_dir):
    """
    Given an archive path, extracts the .tex files and concatenates them together.
    """
    # arXiv e-prints are usually gzipped tarballs, but single-file submissions
    # are just a gzipped .tex file; handle both cases.
    extract_dir = archive_path[:-3]
    if tarfile.is_tarfile(archive_path):
        with tarfile.open(archive_path) as tar:
            tar.extractall(extract_dir)
    else:
        os.makedirs(extract_dir, exist_ok=True)
        with gzip.open(archive_path, "rb") as f_in:
            with open(os.path.join(extract_dir, "main.tex"), "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
    # Concatenate the tex files (including those in subdirectories)
    tex_files = glob.glob(os.path.join(extract_dir, "**", "*.tex"), recursive=True)
    with open(os.path.join(output_dir, "concat.tex"), "w") as f:
        for tex_file in tex_files:
            # Some sources are not valid UTF-8; replace undecodable bytes
            with open(tex_file, encoding="utf-8", errors="replace") as f_in:
                f.write(f_in.read())
    # Remove the extracted directory and gz archive (disabled by default)
    # shutil.rmtree(extract_dir)
    # os.remove(archive_path)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str, help="Arxiv URL")
    parser.add_argument("--output_dir", type=str, default=".", help="Output directory")
    args = parser.parse_args()
    # Make sure the output directory exists before writing into it
    os.makedirs(args.output_dir, exist_ok=True)
    archive_path = download_arxiv(args.url, args.output_dir)
    concat_tex_files_in_archive(archive_path, args.output_dir)


if __name__ == "__main__":
    main()
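
# Example invocation (the filename arxiv_concat.py and the ./out directory are
# hypothetical; any arXiv abs URL works, e.g. the one from the docstring above):
#
#   python arxiv_concat.py https://arxiv.org/abs/2206.13947 --output_dir ./out
#
# This leaves 2206.13947.gz (the raw e-print), the extracted source directory,
# and the concatenated concat.tex in ./out.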