Spaces:

augmentedimaginationhackathon
/

paperstocode

Runtime error

App Files Files Community

reibs committed on Feb 26, 2023

Commit

4d3ca32

1 Parent(s): db06182

util downloader

Browse files

Files changed (2) hide show

.DS_Store +0 -0
downloader.py +68 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

downloader.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# python 3.8.5
+"""
+Given an Arxiv url, downloads the Tex files from the e-print URL,
+opens the directory that was downloaded, and concatenates all the .tex files together
+"""
+import os
+import sys
+import requests
+import shutil
+import gzip
+import glob
+import subprocess
+import time
+import re
+import argparse
+import logging
+# Configure the root logger so INFO-level messages are emitted.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+def download_arxiv(url, output_dir):
+    """
+    Given an Arxiv url, downloads the Tex files from the e-print URL
+    For example, the url https://arxiv.org/abs/2206.13947
+    """
+    logger.info("Downloading %s", url)
+    # Get the arxiv id from the url
+    arxiv_id = re.search(r"arxiv\.org\/abs\/(.*)", url).group(1)
+    # Download the gz archive
+    filename = os.path.join(output_dir, f"{arxiv_id}.gz")
+    with requests.get(f"https://arxiv.org/e-print/{arxiv_id}", stream=True) as r:
+        with open(filename, "wb") as f:
+            shutil.copyfileobj(r.raw, f)
+    return filename
+def concat_tex_files_in_archive(archive_path, output_dir):
+    """
+    Given an archive path, extracts the tex files and concatenates them together
+    """
+    # Extract the gz archive
+    with gzip.open(archive_path, "rb") as f_in:
+        with open(archive_path[:-3], "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    # Concatenate the tex files
+    tex_files = glob.glob(f"{archive_path[:-3]}/*.tex")
+    with open(os.path.join(output_dir, "concat.tex"), "w") as f:
+        for tex_file in tex_files:
+            with open(tex_file) as f_in:
+                f.write(f_in.read())
+    # Remove the extracted file and gz archive
+    # os.remove(archive_path[:-3])
+    # os.remove(archive_path)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("url", type=str, help="Arxiv URL")
+    parser.add_argument("--output_dir", type=str, default=".", help="Output directory")
+    args = parser.parse_args()
+    archive_path = download_arxiv(args.url, args.output_dir)
+    concat_tex_files_in_archive(archive_path, args.output_dir)
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()