minar09 committed on
Commit
c6743f5
·
verified ·
1 Parent(s): 556fc00

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +2 -5
  2. main.py +18 -10
app.py CHANGED
@@ -2,9 +2,6 @@ import os
2
  import gradio as gr
3
  import main
4
 
5
- #os.environ["CUDA_VISIBLE_DEVICES"]='0'
6
- #os.environ["USE_GPU"]="True"
7
-
8
 
9
  def predict_from_pdf(pdf_file):
10
  upload_dir = "./catalogue/"
@@ -35,9 +32,9 @@ demo = gr.Interface(
35
  outputs=["json", "text"],
36
  examples=pdf_examples,
37
  title="Open Source PDF Catalog Parser",
38
- description="Efficient PDF catalog processing using PyMuPDF and OpenLLM",
39
  article="Uses MinerU for layout analysis and DeepSeek-7B for structured extraction"
40
  )
41
 
42
  if __name__ == "__main__":
43
- demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)
 
2
  import gradio as gr
3
  import main
4
 
 
 
 
5
 
6
  def predict_from_pdf(pdf_file):
7
  upload_dir = "./catalogue/"
 
32
  outputs=["json", "text"],
33
  examples=pdf_examples,
34
  title="Open Source PDF Catalog Parser",
35
+ description="Efficient PDF catalog processing using MinerU and OpenLLM",
36
  article="Uses MinerU for layout analysis and DeepSeek-7B for structured extraction"
37
  )
38
 
39
  if __name__ == "__main__":
40
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)
main.py CHANGED
@@ -42,23 +42,26 @@ class PDFProcessor:
42
  self.output_dir.mkdir(exist_ok=True)
43
 
44
  def _initialize_emb_model(self, model_name):
45
- # model = SentenceTransformer("sentence-transformers/" + model_name)
46
- # model = SentenceTransformer(model_name)
47
- # model.save('models/'+ model_name)
48
- # Load model directly
49
- from transformers import AutoTokenizer, AutoModel
 
 
50
 
51
- tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
52
- model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
53
- return model
54
 
55
  def _initialize_llm(self, model_name):
56
  """Initialize LLM with automatic download if needed"""
 
57
  model_path = os.path.join("models/", model_name)
58
  if os.path.exists(model_path):
59
  return Llama(
60
  model_path=model_path,
61
- n_ctx=4096,
62
  n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
63
  n_threads=os.cpu_count() - 1,
64
  verbose=False
@@ -67,11 +70,16 @@ class PDFProcessor:
67
  return Llama.from_pretrained(
68
  repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
69
  filename=model_name,
70
- n_ctx=4096,
71
  n_threads=os.cpu_count() - 1,
72
  n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
73
  verbose=False
74
  )
 
 
 
 
 
75
 
76
  def process_pdf(self, pdf_path: str) -> Dict:
77
  """Process PDF using MinerU pipeline"""
 
42
  self.output_dir.mkdir(exist_ok=True)
43
 
44
  def _initialize_emb_model(self, model_name):
45
+ try:
46
+ model = SentenceTransformer("sentence-transformers/" + model_name)
47
+ model.save('models/'+ model_name)
48
+ return model
49
+ except:
50
+ # Load model directly
51
+ from transformers import AutoTokenizer, AutoModel
52
 
53
+ tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
54
+ model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
55
+ return model
56
 
57
  def _initialize_llm(self, model_name):
58
  """Initialize LLM with automatic download if needed"""
59
+ """
60
  model_path = os.path.join("models/", model_name)
61
  if os.path.exists(model_path):
62
  return Llama(
63
  model_path=model_path,
64
+ n_ctx=2048,
65
  n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
66
  n_threads=os.cpu_count() - 1,
67
  verbose=False
 
70
  return Llama.from_pretrained(
71
  repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
72
  filename=model_name,
73
+ n_ctx=2048,
74
  n_threads=os.cpu_count() - 1,
75
  n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
76
  verbose=False
77
  )
78
+ """
79
+ # Load model directly
80
+ from transformers import AutoModel
81
+ model = AutoModel.from_pretrained("TheBloke/deepseek-llm-7B-base-GGUF")
82
+ return model
83
 
84
  def process_pdf(self, pdf_path: str) -> Dict:
85
  """Process PDF using MinerU pipeline"""