# (Hosting-page residue captured with this file: "Spaces: Runtime error")
# python 3.8.5
"""
Given an Arxiv url, downloads the TeX files from the e-print URL,
opens the directory that was downloaded, and concatenates all the .tex files together.
"""
import argparse
import glob
import gzip
import logging
import os
import re
import shutil
import subprocess
import sys
import tarfile
import time

import requests
# Configure the root logger at import time so INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def download_arxiv(url, output_dir):
    """
    Given an Arxiv url, downloads the source archive from the e-print URL.

    For example, the url https://arxiv.org/abs/2206.13947 is fetched from
    https://arxiv.org/e-print/2206.13947 and saved as ``2206.13947.gz``.

    Args:
        url: arXiv abstract URL containing ``arxiv.org/abs/<id>``.
        output_dir: Directory the archive is written to (created if missing).

    Returns:
        Path of the downloaded ``.gz`` file inside ``output_dir``.

    Raises:
        ValueError: If no arXiv id can be found in ``url``.
        requests.HTTPError: If the e-print request returns an error status.
    """
    # Stop at '?', '#' or whitespace so a query string or fragment is not
    # mistaken for part of the identifier.
    match = re.search(r"arxiv\.org/abs/([^?#\s]+)", url)
    arxiv_id = match.group(1).strip("/") if match else ""
    if not arxiv_id:
        raise ValueError(f"Could not find an arXiv id in URL: {url!r}")

    logger.info("Downloading %s", url)

    # Old-style ids (e.g. "hep-th/9901001") contain a slash; flatten it so the
    # archive lands directly in output_dir instead of a missing subdirectory.
    filename = os.path.join(output_dir, f"{arxiv_id.replace('/', '_')}.gz")
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Download the gz archive
    with requests.get(
        f"https://arxiv.org/e-print/{arxiv_id}", stream=True, timeout=60
    ) as r:
        # Fail before opening the output file so an HTTP error page is never
        # saved under a ".gz" name.
        r.raise_for_status()
        with open(filename, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    return filename
def concat_tex_files_in_archive(archive_path, output_dir):
    """
    Given an archive path, extracts the tex files and concatenates them together.

    The gzip layer of ``archive_path`` is decompressed next to the archive.
    If the payload is a tar archive, every regular ``.tex`` member is
    appended (sorted by member name) to ``concat.tex``; otherwise the payload
    itself is treated as a single TeX source file.

    Args:
        archive_path: Path to the gzip-compressed download (normally ``*.gz``).
        output_dir: Directory in which ``concat.tex`` is written (created if
            missing).

    Returns:
        None. The result is written to ``<output_dir>/concat.tex``.
    """
    # Strip the ".gz" suffix when present; never blindly chop three characters
    # off a path that does not end in ".gz".
    if archive_path.endswith(".gz"):
        extracted_path = archive_path[:-3]
    else:
        extracted_path = f"{archive_path}.extracted"

    # Extract the gz archive
    with gzip.open(archive_path, "rb") as f_in, open(extracted_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Concatenate the tex files
    concat_path = os.path.join(output_dir, "concat.tex")
    with open(concat_path, "w", encoding="utf-8") as f_out:
        if tarfile.is_tarfile(extracted_path):
            with tarfile.open(extracted_path, "r") as tar:
                tex_members = sorted(
                    (
                        member
                        for member in tar.getmembers()
                        if member.isfile() and member.name.endswith(".tex")
                    ),
                    key=lambda member: member.name,
                )
                # Members are read straight from the tar, so nothing is
                # unpacked to disk and member paths cannot escape output_dir.
                for member in tex_members:
                    data = tar.extractfile(member).read()
                    f_out.write(data.decode("utf-8", errors="replace"))
        else:
            # Not a tar archive: the payload is a single gzipped source file.
            with open(extracted_path, "rb") as f_in:
                f_out.write(f_in.read().decode("utf-8", errors="replace"))
    # The decompressed payload and the original .gz are intentionally left on
    # disk (the original cleanup was disabled).
def main():
    """Command-line entry point: fetch the arXiv source, then merge its .tex files."""
    cli = argparse.ArgumentParser()
    cli.add_argument("url", type=str, help="Arxiv URL")
    cli.add_argument("--output_dir", type=str, default=".", help="Output directory")
    options = cli.parse_args()

    # Download first; the concatenation step consumes the path it returns.
    concat_tex_files_in_archive(
        download_arxiv(options.url, options.output_dir),
        options.output_dir,
    )
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()