Spaces:
Build error
Build error
Agregar estructura de proyecto
Browse files- Dockerfile +28 -0
- README.md +0 -12
- app.py +28 -0
- frontend/Index.svelte +77 -0
- frontend/package.json +15 -0
- frontend/vite.config.js +10 -0
- requirements.txt +3 -0
Dockerfile
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
FROM python:3.9-slim

# PDF/OCR command-line tools plus Node.js and npm for the frontend build.
# The apt package index is deleted in the same RUN step so that it is not
# baked into the resulting image layer.
RUN apt-get update && apt-get install -y \
    ghostscript \
    tesseract-ocr \
    poppler-utils \
    libxml2 \
    unpaper \
    nodejs \
    npm \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Python dependencies are installed before the sources are copied so this
# layer is reused as long as requirements.txt does not change.
COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Build the frontend bundle.
COPY frontend/ ./frontend/
WORKDIR /app/frontend
RUN npm install && npm run build

# Copy the rest of the project (including app.py) next to the build output.
WORKDIR /app
COPY . .

EXPOSE 7860

CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,12 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: PDF OCR
|
3 |
-
emoji: 📉
|
4 |
-
colorFrom: pink
|
5 |
-
colorTo: pink
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 5.7.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import gradio as gr
|
3 |
+
import os
|
4 |
+
from PyPDF2 import PdfReader
|
5 |
+
import tempfile
|
6 |
+
|
7 |
+
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one stripped string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The per-page text concatenated in page order with no separator,
        with leading and trailing whitespace removed.
    """
    document = PdfReader(pdf_path)
    page_texts = []
    for page in document.pages:
        page_texts.append(page.extract_text())
    combined = "".join(page_texts)
    return combined.strip()
|
11 |
+
|
12 |
+
def process_pdf(file):
    """Extract the text of an uploaded PDF.

    Args:
        file: The upload handed over by the UI. Accepted forms are ``None``
            (nothing uploaded), a filesystem path (``str`` or
            ``os.PathLike``), or an object whose ``read()`` returns the raw
            PDF bytes.

    Returns:
        The text produced by ``extract_text_from_pdf``, or an empty string
        when ``file`` is ``None``.
    """
    # Clicking the button without an upload must not crash the handler.
    if file is None:
        return ""

    # The upload is already a file on disk: read it in place, no copy needed.
    if isinstance(file, (str, os.PathLike)):
        return extract_text_from_pdf(file)

    # File-like upload: spool the bytes to a temporary file so the reader can
    # open it by path. delete=False keeps the file alive after it is closed,
    # so it has to be removed explicitly once extraction is done.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with temp_pdf:
            temp_pdf.write(file.read())
        return extract_text_from_pdf(temp_pdf.name)
    finally:
        os.remove(temp_pdf.name)
|
19 |
+
|
20 |
+
# UI layout: a PDF upload, a text box for the extracted text, and a button
# that runs process_pdf on the upload and shows the result.
with gr.Blocks() as demo:
    gr.Markdown("# Procesador de PDFs en Hugging Face Space")
    pdf_file = gr.File(label="Carga tu PDF", file_types=[".pdf"])
    text_output = gr.Textbox(label="Texto Extraído", lines=10)
    process_button = gr.Button("Procesar PDF")

    process_button.click(process_pdf, inputs=[pdf_file], outputs=[text_output])

# Listen on all interfaces and on the port the container exposes; a
# loopback-only bind would not be reachable from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)
|
frontend/Index.svelte
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
<script lang="ts">
|
3 |
+
import pdfjsLib from "pdfjs-dist";
|
4 |
+
// Fetch the worker from the same pdfjs-dist release as the imported API.
// An unversioned CDN URL follows the latest published release, which may not
// match the installed library version.
pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@${pdfjsLib.version}/build/pdf.worker.min.js`;
|
5 |
+
|
6 |
+
let pdfDoc;
|
7 |
+
let numPages = 1;
|
8 |
+
let currentPage = 1;
|
9 |
+
let canvasRef;
|
10 |
+
export let value;
|
11 |
+
|
12 |
+
/**
 * Load the PDF found at `value.url`, reset the viewer to the first page and
 * draw that page.
 *
 * @param value Object whose `url` property points at the PDF to display.
 */
async function loadPDF(value) {
    const source = value.url;
    const task = pdfjsLib.getDocument(source);

    pdfDoc = await task.promise;
    numPages = pdfDoc.numPages;
    currentPage = 1;

    renderPage();
}
|
19 |
+
|
20 |
+
// Render task currently painting the canvas, if any.
let activeRenderTask = null;

/**
 * Draw `currentPage` of the loaded document onto the bound canvas at 1.5x
 * scale. Does nothing when no document is loaded or the canvas is not bound.
 *
 * @returns Promise that settles once the page is painted or the render has
 *          been superseded by a newer one.
 */
async function renderPage() {
    if (!pdfDoc || !canvasRef) {
        return;
    }

    const page = await pdfDoc.getPage(currentPage);

    // The user may have moved to another page while this one was loading.
    if (page.pageNumber !== currentPage || !canvasRef) {
        return;
    }

    // Only one render may use the canvas at a time, so stop the previous one
    // before starting the next.
    if (activeRenderTask) {
        activeRenderTask.cancel();
    }

    const viewport = page.getViewport({ scale: 1.5 });
    const canvasContext = canvasRef.getContext("2d");
    canvasRef.width = viewport.width;
    canvasRef.height = viewport.height;

    const task = page.render({
        canvasContext,
        viewport,
    });
    activeRenderTask = task;

    try {
        await task.promise;
    } catch (err) {
        // A cancelled render is expected when the user navigates quickly.
        if (!err || err.name !== "RenderingCancelledException") {
            throw err;
        }
    } finally {
        if (activeRenderTask === task) {
            activeRenderTask = null;
        }
    }
}
|
33 |
+
|
34 |
+
/** Advance to the following page and redraw, unless already on the last page. */
function nextPage() {
    const onLastPage = currentPage >= numPages;
    if (onLastPage) {
        return;
    }

    currentPage += 1;
    renderPage();
}
|
40 |
+
|
41 |
+
/** Go back to the preceding page and redraw, unless already on the first page. */
function prevPage() {
    const onFirstPage = currentPage <= 1;
    if (onFirstPage) {
        return;
    }

    currentPage -= 1;
    renderPage();
}
|
47 |
+
|
48 |
+
// Reload the document whenever `value` changes to something truthy.
$: value && loadPDF(value);
|
51 |
+
</script>
|
52 |
+
|
53 |
+
<div>
|
54 |
+
<div class="pdf-viewer">
|
55 |
+
<canvas bind:this={canvasRef}></canvas>
|
56 |
+
</div>
|
57 |
+
<div class="controls">
|
58 |
+
<button on:click={prevPage} disabled={currentPage === 1}>⬅️ Anterior</button>
|
59 |
+
<span>{currentPage} / {numPages}</span>
|
60 |
+
<button on:click={nextPage} disabled={currentPage === numPages}>Siguiente ➡️</button>
|
61 |
+
</div>
|
62 |
+
</div>
|
63 |
+
|
64 |
+
<style>
|
65 |
+
.pdf-viewer {
|
66 |
+
display: flex;
|
67 |
+
justify-content: center;
|
68 |
+
align-items: center;
|
69 |
+
height: 500px;
|
70 |
+
}
|
71 |
+
.controls {
|
72 |
+
display: flex;
|
73 |
+
justify-content: center;
|
74 |
+
gap: 10px;
|
75 |
+
margin-top: 10px;
|
76 |
+
}
|
77 |
+
</style>
|
frontend/package.json
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
{
|
3 |
+
"name": "frontend",
|
4 |
+
"version": "1.0.0",
|
5 |
+
"scripts": {
|
6 |
+
"dev": "vite",
|
7 |
+
"build": "vite build"
|
8 |
+
},
|
9 |
+
"dependencies": {
|
10 |
+
"pdfjs-dist": "^3.11.174"
|
11 |
+
},
|
12 |
+
"devDependencies": {
|
13 |
+
"vite": "^4.0.0"
|
14 |
+
}
|
15 |
+
}
|
frontend/vite.config.js
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import { defineConfig } from "vite";
|
3 |
+
|
4 |
+
// Build output goes one level above this directory, into `dist`, and that
// directory is emptied before each build.
const buildOptions = {
    outDir: "../dist",
    emptyOutDir: true,
};

export default defineConfig({
    root: "./",
    build: buildOptions,
});
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
gradio
|
3 |
+
PyPDF2
|