reibs committed on
Commit
4d3ca32
1 Parent(s): db06182

util downloader

Browse files
Files changed (2) hide show
  1. .DS_Store +0 -0
  2. downloader.py +68 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
downloader.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python 3.8.5
2
+ """
3
+ Given an Arxiv url, downloads the Tex files from the e-print URL,
4
+ opens the directory that was downloaded, and concatenates all the .tex files together
5
+ """
6
+ import os
7
+ import sys
8
+ import requests
9
+ import shutil
10
+ import gzip
11
+ import glob
12
+ import subprocess
13
+ import time
14
+ import re
15
+ import argparse
16
+ import logging
17
+
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def download_arxiv(url, output_dir):
    """
    Download the gzipped source archive of an arXiv paper.

    Given an arXiv abstract URL, for example
    https://arxiv.org/abs/2206.13947, fetches
    https://arxiv.org/e-print/<id> and stores the raw response body as
    ``<id>.gz`` inside ``output_dir``.

    Args:
        url: arXiv abstract URL containing ``arxiv.org/abs/<id>``.
        output_dir: Directory to write the archive into; created if missing.

    Returns:
        Path of the downloaded ``.gz`` file.

    Raises:
        ValueError: If no arXiv id can be found in ``url``.
        requests.HTTPError: If the server answers with an error status.
    """
    # Validate the URL before touching the network or the disk. A query
    # string, fragment or trailing slash is not part of the arXiv id.
    match = re.search(r"arxiv\.org/abs/([^?#\s]+)", url)
    arxiv_id = match.group(1).strip("/") if match else ""
    if not arxiv_id:
        raise ValueError(f"Could not find an arXiv id in URL: {url!r}")

    logger.info("Downloading %s", url)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Old-style ids such as "hep-th/9901001" contain a slash, which would
    # otherwise point into a sub-directory that does not exist.
    safe_id = arxiv_id.replace("/", "_")
    filename = os.path.join(output_dir, f"{safe_id}.gz")

    # The timeout bounds connect/read stalls, not the total download time.
    with requests.get(
        f"https://arxiv.org/e-print/{arxiv_id}", stream=True, timeout=30
    ) as r:
        # Fail loudly instead of saving an HTML error page as the archive.
        r.raise_for_status()
        with open(filename, "wb") as f:
            # r.raw is copied undecoded so the payload stays gzip-compressed.
            shutil.copyfileobj(r.raw, f)
    return filename
36
+
37
+
38
def concat_tex_files_in_archive(archive_path, output_dir):
    """
    Unpack a downloaded source archive and concatenate its .tex files.

    The archive is expected to be either a gzipped tar archive or a single
    gzipped file. A tar archive is unpacked into a directory named after the
    archive without its ``.gz`` suffix, and every ``.tex`` file below it is
    collected, sub-directories included. A single gzipped file is
    decompressed to that same path and is assumed to be the TeX source.

    The collected files are written, in sorted path order, to
    ``<output_dir>/concat.tex``. Data is copied as bytes so sources that are
    not valid UTF-8 do not fail, and a newline is inserted after any file
    that does not already end with one.

    The extracted files and the archive itself are intentionally left on
    disk.

    Args:
        archive_path: Path of the downloaded ``.gz`` archive.
        output_dir: Directory that receives ``concat.tex``; created if
            missing.

    Raises:
        OSError: If the archive is neither a tar archive nor gzip data, or
            if a file cannot be read or written.
    """
    # Function-scope import so this block does not depend on the file header.
    import tarfile

    # Strip the ".gz" suffix; never blindly chop three characters off a
    # path that does not carry it.
    if archive_path.endswith(".gz"):
        extracted_path = archive_path[:-3]
    else:
        extracted_path = archive_path + ".extracted"

    if tarfile.is_tarfile(archive_path):
        root = os.path.realpath(extracted_path)
        with tarfile.open(archive_path) as tar:
            # Only plain files and directories that stay inside the target
            # directory are extracted (guards against "../" and links).
            safe_members = [
                member
                for member in tar.getmembers()
                if (member.isfile() or member.isdir())
                and os.path.realpath(
                    os.path.join(root, member.name)
                ).startswith(root + os.sep)
            ]
            tar.extractall(extracted_path, members=safe_members)
        pattern = os.path.join(glob.escape(extracted_path), "**", "*.tex")
        # Sorted so the concatenation order is reproducible.
        tex_files = sorted(glob.glob(pattern, recursive=True))
    else:
        # Not a tarball: treat the payload as one gzipped TeX file.
        with gzip.open(archive_path, "rb") as f_in:
            with open(extracted_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        tex_files = [extracted_path]

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, "concat.tex"), "wb") as f:
        for tex_file in tex_files:
            with open(tex_file, "rb") as f_in:
                data = f_in.read()
            f.write(data)
            # Keep the last line of one file from fusing with the first
            # line of the next.
            if data and not data.endswith(b"\n"):
                f.write(b"\n")
55
+
56
+
57
def main():
    """
    Command-line entry point.

    Reads the arXiv URL and the optional ``--output_dir`` from the command
    line, downloads the source archive, and writes the concatenated TeX
    into the output directory.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("url", type=str, help="Arxiv URL")
    cli.add_argument(
        "--output_dir", type=str, default=".", help="Output directory"
    )
    options = cli.parse_args()
    target_dir = options.output_dir

    # Download first; the resulting archive path feeds the concatenation.
    downloaded = download_arxiv(options.url, target_dir)
    concat_tex_files_in_archive(downloaded, target_dir)


if __name__ == "__main__":
    main()