maxidl committed on
Commit
3879030
·
1 Parent(s): efc95b5
Files changed (1) hide show
  1. app.py +38 -2
app.py CHANGED
@@ -10,9 +10,20 @@ from marker.pdf.extract_text import get_length_of_text
10
  from marker.models import load_all_models
11
  from marker.settings import settings
12
  from marker.logger import configure_logging
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- model_name = "maxidl/arena-test"
15
 
 
16
  model = AutoModelForCausalLM.from_pretrained(
17
  model_name,
18
  torch_dtype=torch.bfloat16,
@@ -25,8 +36,33 @@ title = "# Placeholder Title"
25
  steps = """Placeholder Description"""
26
  # steps = """1. Converts uploaded pdf file to markdown. You can edit the intermediate markdown output.\n2. Generates a review for the paper"""
27
 
 
 
 
 
 
 
 
28
  def process_file(file):
29
- return "Processed file"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
 
32
  @spaces.GPU(duration=60)
 
10
  from marker.models import load_all_models
11
  from marker.settings import settings
12
  from marker.logger import configure_logging
13
+ import traceback
14
+
15
+
16
+ configure_logging()
17
+ MAX_PAGES = 20
18
+ MIN_LENGTH=200
19
+ settings.EXTRACT_IMAGES = False
20
+ settings.DEBUG = False
21
+
22
+ model_refs = load_all_models()
23
+ metadata = {}
24
 
 
25
 
26
+ model_name = "maxidl/arena-test"
27
  model = AutoModelForCausalLM.from_pretrained(
28
  model_name,
29
  torch_dtype=torch.bfloat16,
 
36
  steps = """Placeholder Description"""
37
  # steps = """1. Converts uploaded pdf file to markdown. You can edit the intermediate markdown output.\n2. Generates a review for the paper"""
38
 
39
@spaces.GPU(duration=60)
def convert_file(filepath):
    """Convert the document at ``filepath`` and return its text, stripped.

    Calls ``convert_single_pdf`` with the module-level ``model_refs`` and
    ``metadata``, limiting the conversion to ``MAX_PAGES`` pages.

    Args:
        filepath: Path of the file to convert.

    Returns:
        The converted text with leading and trailing whitespace removed.
    """
    conversion = convert_single_pdf(
        filepath,
        model_refs,
        metadata=metadata,
        max_pages=MAX_PAGES,
    )
    # The result unpacks into three values; only the text is used here.
    markdown, _images, _out_metadata = conversion
    return markdown.strip()
45
+
46
  def process_file(file):
47
+ print(file.name)
48
+ filepath = file.name
49
+ try:
50
+ if MIN_LENGTH:
51
+ filetype = find_filetype(filepath)
52
+ if filetype == "other":
53
+ raise ValueError()
54
+
55
+ length = get_length_of_text(filepath)
56
+ if length < MIN_LENGTH:
57
+ raise ValueError()
58
+ paper_text = convert_file(filepath)
59
+ if not len(paper_text) > MIN_LENGTH:
60
+ raise ValueError()
61
+ except Exception as e:
62
+ print(traceback.format_exc())
63
+ print(f"Error converting {filepath}: {e}")
64
+ return "Error processing pdf"
65
+ return paper_text
66
 
67
 
68
  @spaces.GPU(duration=60)