Vladyslav Nalyvaiko
committed on
Commit
·
ca05b65
1
Parent(s):
11551ca
Fast API update
Browse files
- app.py +10 -58
- mineru_single.py +1 -5
app.py
CHANGED
@@ -10,7 +10,7 @@ from typing import List, Optional
|
|
10 |
# Alternatively you can do this in a "startup" event handler
|
11 |
os.system("python download_models_hf.py")
|
12 |
|
13 |
-
from
|
14 |
# Or if you want single-file approach, from miner_single import to_markdown
|
15 |
|
16 |
app = FastAPI()
|
@@ -22,67 +22,19 @@ os.makedirs(INBOX_DIR, exist_ok=True)
|
|
22 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
23 |
|
24 |
@app.post("/process")
|
25 |
-
async def
|
26 |
-
files: List[UploadFile] = File(...),
|
27 |
-
background_tasks: BackgroundTasks = None,
|
28 |
-
num_workers: int = 2,
|
29 |
-
num_gpus: int = 1
|
30 |
-
):
|
31 |
-
"""
|
32 |
-
POST multiple PDFs via multipart/form-data.
|
33 |
-
We store them in ./inbox, then process in the background using parallel_processor.
|
34 |
-
"""
|
35 |
-
pdf_paths = []
|
36 |
-
for f in files:
|
37 |
-
file_path = os.path.join(INBOX_DIR, f.filename)
|
38 |
-
with open(file_path, "wb") as out_file:
|
39 |
-
shutil.copyfileobj(f.file, out_file)
|
40 |
-
pdf_paths.append(file_path)
|
41 |
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
pdf_paths,
|
46 |
-
OUTPUT_DIR,
|
47 |
-
num_workers,
|
48 |
-
num_gpus
|
49 |
-
)
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
"files_received": [f.filename for f in files],
|
54 |
-
"workers": num_workers,
|
55 |
-
"gpus": num_gpus
|
56 |
-
}
|
57 |
-
|
58 |
-
@app.get("/check_output")
|
59 |
-
def check_output(filename: str):
|
60 |
-
"""
|
61 |
-
Simple endpoint: provide a PDF filename, returns the final .md (if ready).
|
62 |
-
We assume the PDF was named e.g. 'paper.pdf'. The final output is `paper.md`
|
63 |
-
in `./output/paper/paper_<timestamp>.md`.
|
64 |
-
Because of how we rename outputs in `miner_single.py`, you may need to locate them by pattern.
|
65 |
-
"""
|
66 |
-
name_wo_ext = os.path.splitext(filename)[0]
|
67 |
-
# Because we appended timestamp, let's see if we can locate a .md in the folder
|
68 |
-
subdir = os.path.join(OUTPUT_DIR, name_wo_ext)
|
69 |
-
if not os.path.exists(subdir):
|
70 |
-
return {"status": "not_found"}
|
71 |
-
|
72 |
-
# Try to find a .md in the subdir
|
73 |
-
found_md = [f for f in os.listdir(subdir) if f.endswith(".md")]
|
74 |
-
if not found_md:
|
75 |
-
return {"status": "incomplete"}
|
76 |
-
|
77 |
-
# If we do find it:
|
78 |
-
md_path = os.path.join(subdir, found_md[0])
|
79 |
-
with open(md_path, "r", encoding="utf-8") as f:
|
80 |
-
content = f.read()
|
81 |
|
82 |
return {
|
83 |
-
"
|
84 |
-
"
|
85 |
-
"content":
|
86 |
}
|
87 |
|
88 |
# If you want to run locally or for debug:
|
|
|
10 |
# Alternatively you can do this in a "startup" event handler
|
11 |
os.system("python download_models_hf.py")
|
12 |
|
13 |
+
from mineru_single import to_markdown
|
14 |
# Or if you want single-file approach, from miner_single import to_markdown
|
15 |
|
16 |
app = FastAPI()
|
|
|
22 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
23 |
|
24 |
@app.post("/process")
|
25 |
+
async def process_pdf(file: UploadFile = File(...)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
+
file_path = os.path.join(INBOX_DIR, file.filename)
|
28 |
+
with open(file_path, "wb") as out_file:
|
29 |
+
shutil.copyfileobj(file.file, out_file)
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
+
# Process the file and wait for completion
|
32 |
+
markdown_text = to_markdown(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
return {
|
35 |
+
"message": "Processing completed",
|
36 |
+
"code": 200,
|
37 |
+
"content": markdown_text
|
38 |
}
|
39 |
|
40 |
# If you want to run locally or for debug:
|
mineru_single.py
CHANGED
@@ -135,8 +135,4 @@ def to_markdown(
|
|
135 |
|
136 |
md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)
|
137 |
|
138 |
-
|
139 |
-
with open(md_path, "w", encoding="utf-8") as fw:
|
140 |
-
fw.write(md_content_with_embeds)
|
141 |
-
|
142 |
-
return md_path
|
|
|
135 |
|
136 |
md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)
|
137 |
|
138 |
+
return md_content_with_embeds
|
|
|
|
|
|
|
|