File size: 2,099 Bytes
4d3ca32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# python 3.8.5
"""
Given an Arxiv url, downloads the Tex files from the e-print URL,
opens the directory that was downloaded, and concatenates all the .tex files together
"""
import argparse
import glob
import gzip
import logging
import os
import re
import shutil
import subprocess
import sys
import tarfile
import time

import requests

# Configure root logging at INFO level as soon as the module is loaded;
# `logger` is the module-level logger used by the functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_arxiv(url, output_dir):
    """
    Given an Arxiv url, downloads the Tex files from the e-print URL
    For example, the url https://arxiv.org/abs/2206.13947

    Args:
        url: Abstract-page URL containing ``arxiv.org/abs/<id>``. Any query
            string or fragment after the id is ignored.
        output_dir: Directory the archive is written to; created if missing.

    Returns:
        Path of the downloaded archive, ``<output_dir>/<id>.gz``.

    Raises:
        ValueError: If no arXiv id can be found in ``url``.
        requests.HTTPError: If the e-print request returns an error status.
    """
    # Validate first so a bad URL fails with a clear message instead of an
    # AttributeError from calling .group() on a failed match.
    match = re.search(r"arxiv\.org/abs/([^?#\s]+)", url)
    arxiv_id = match.group(1).strip("/") if match else ""
    if not arxiv_id:
        raise ValueError(f"Not an arXiv abstract URL: {url!r}")

    logger.info("Downloading %s", url)
    filename = os.path.join(output_dir, f"{arxiv_id}.gz")
    # Old-style ids (e.g. "hep-th/9901001") contain a slash, so the target's
    # parent may be a sub-directory of output_dir that does not exist yet.
    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)

    # Download the gz archive
    with requests.get(
        f"https://arxiv.org/e-print/{arxiv_id}", stream=True, timeout=30
    ) as r:
        # Without this an HTML error page would be saved as "<id>.gz".
        r.raise_for_status()
        with open(filename, "wb") as f:
            # r.raw is the underlying byte stream, copied to disk as received.
            shutil.copyfileobj(r.raw, f)
    return filename


def concat_tex_files_in_archive(archive_path, output_dir):
    """
    Given an archive path, extracts the tex files and concatenates them together

    The gzip payload is first decompressed next to the archive (same path
    without the trailing ".gz"). If that payload is a tar archive, every
    regular ``.tex`` member is appended to ``concat.tex`` in name order;
    otherwise the payload itself is treated as a single TeX source and copied.

    Args:
        archive_path: Path to the gzip-compressed download; expected to end
            in ".gz" (the last three characters are stripped to name the
            decompressed file).
        output_dir: Existing directory that receives ``concat.tex``.
    """
    # Extract the gz archive
    extracted_path = archive_path[:-3]
    with gzip.open(archive_path, "rb") as f_in, open(extracted_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Concatenate the tex files. Everything is copied as bytes so sources in
    # legacy encodings (e.g. Latin-1) cannot trigger a decode error.
    concat_path = os.path.join(output_dir, "concat.tex")
    with open(concat_path, "wb") as f_out:
        if tarfile.is_tarfile(extracted_path):
            with tarfile.open(extracted_path) as tar:
                # Sort by member name so the output is deterministic.
                tex_members = sorted(
                    (
                        member
                        for member in tar.getmembers()
                        if member.isfile() and member.name.endswith(".tex")
                    ),
                    key=lambda member: member.name,
                )
                for member in tex_members:
                    # Read straight from the tar: nothing is unpacked to disk,
                    # so hostile member paths cannot escape output_dir.
                    with tar.extractfile(member) as f_in:
                        shutil.copyfileobj(f_in, f_out)
        else:
            # Not a tar: assume the payload is a single TeX file.
            with open(extracted_path, "rb") as f_in:
                shutil.copyfileobj(f_in, f_out)
    # The decompressed payload and the .gz archive are intentionally left on disk.

def main():
    """
    Command-line entry point: parses the URL and output directory, downloads
    the archive, then concatenates its tex files
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("url", type=str, help="Arxiv URL")
    cli.add_argument("--output_dir", type=str, default=".", help="Output directory")
    options = cli.parse_args()

    # The download step returns the archive path consumed by the concat step.
    concat_tex_files_in_archive(
        download_arxiv(options.url, options.output_dir),
        options.output_dir,
    )


# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()