Vladyslav Nalyvaiko committed on
Commit
ca05b65
·
1 Parent(s): 11551ca

Fast API update

Browse files
Files changed (2) hide show
  1. app.py +10 -58
  2. mineru_single.py +1 -5
app.py CHANGED
@@ -10,7 +10,7 @@ from typing import List, Optional
10
  # Alternatively you can do this in a "startup" event handler
11
  os.system("python download_models_hf.py")
12
 
13
- from parallel_multiproc import process_batch_in_parallel
14
  # Or if you want single-file approach, from miner_single import to_markdown
15
 
16
  app = FastAPI()
@@ -22,67 +22,19 @@ os.makedirs(INBOX_DIR, exist_ok=True)
22
  os.makedirs(OUTPUT_DIR, exist_ok=True)
23
 
24
  @app.post("/process")
25
- async def process_pdfs_in_parallel(
26
- files: List[UploadFile] = File(...),
27
- background_tasks: BackgroundTasks = None,
28
- num_workers: int = 2,
29
- num_gpus: int = 1
30
- ):
31
- """
32
- POST multiple PDFs via multipart/form-data.
33
- We store them in ./inbox, then process in the background using parallel_processor.
34
- """
35
- pdf_paths = []
36
- for f in files:
37
- file_path = os.path.join(INBOX_DIR, f.filename)
38
- with open(file_path, "wb") as out_file:
39
- shutil.copyfileobj(f.file, out_file)
40
- pdf_paths.append(file_path)
41
 
42
- # Launch parallel processing in background (so we can return immediately)
43
- background_tasks.add_task(
44
- process_batch_in_parallel,
45
- pdf_paths,
46
- OUTPUT_DIR,
47
- num_workers,
48
- num_gpus
49
- )
50
 
51
- return {
52
- "message": "Processing started",
53
- "files_received": [f.filename for f in files],
54
- "workers": num_workers,
55
- "gpus": num_gpus
56
- }
57
-
58
- @app.get("/check_output")
59
- def check_output(filename: str):
60
- """
61
- Simple endpoint: provide a PDF filename, returns the final .md (if ready).
62
- We assume the PDF was named e.g. 'paper.pdf'. The final output is `paper.md`
63
- in `./output/paper/paper_<timestamp>.md`.
64
- Because of how we rename outputs in `miner_single.py`, you may need to locate them by pattern.
65
- """
66
- name_wo_ext = os.path.splitext(filename)[0]
67
- # Because we appended timestamp, let's see if we can locate a .md in the folder
68
- subdir = os.path.join(OUTPUT_DIR, name_wo_ext)
69
- if not os.path.exists(subdir):
70
- return {"status": "not_found"}
71
-
72
- # Try to find a .md in the subdir
73
- found_md = [f for f in os.listdir(subdir) if f.endswith(".md")]
74
- if not found_md:
75
- return {"status": "incomplete"}
76
-
77
- # If we do find it:
78
- md_path = os.path.join(subdir, found_md[0])
79
- with open(md_path, "r", encoding="utf-8") as f:
80
- content = f.read()
81
 
82
  return {
83
- "status": "complete",
84
- "markdown_file": found_md[0],
85
- "content": content
86
  }
87
 
88
  # If you want to run locally or for debug:
 
10
  # Alternatively you can do this in a "startup" event handler
11
  os.system("python download_models_hf.py")
12
 
13
+ from mineru_single import to_markdown
14
  # Or if you want single-file approach, from miner_single import to_markdown
15
 
16
  app = FastAPI()
 
22
  os.makedirs(OUTPUT_DIR, exist_ok=True)
23
 
24
  @app.post("/process")
25
+ async def process_pdf(file: UploadFile = File(...)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ file_path = os.path.join(INBOX_DIR, file.filename)
28
+ with open(file_path, "wb") as out_file:
29
+ shutil.copyfileobj(file.file, out_file)
 
 
 
 
 
30
 
31
+ # Process the file and wait for completion
32
+ markdown_text = to_markdown(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  return {
35
+ "message": "Processing completed",
36
+ "code": 200,
37
+ "content": markdown_text
38
  }
39
 
40
  # If you want to run locally or for debug:
mineru_single.py CHANGED
@@ -135,8 +135,4 @@ def to_markdown(
135
 
136
  md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)
137
 
138
- # Overwrite the original Markdown with the embedded one
139
- with open(md_path, "w", encoding="utf-8") as fw:
140
- fw.write(md_content_with_embeds)
141
-
142
- return md_path
 
135
 
136
  md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)
137
 
138
+ return md_content_with_embeds