Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
|
|
7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
8 |
from langchain.prompts import PromptTemplate
|
9 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
10 |
-
from read_photodocument import convert_PDF_to_Text
|
11 |
from doctr.io import DocumentFile
|
12 |
from doctr.models import ocr_predictor
|
13 |
import contextlib
|
@@ -61,6 +61,7 @@ def summarize_data(docs,llm_model,chain_type='refine'):
|
|
61 |
|
62 |
prompt = PromptTemplate.from_template(prompt_template)
|
63 |
refine_prompt = PromptTemplate.from_template(refine_template)
|
|
|
64 |
|
65 |
chain = load_summarize_chain(llm=llm_model,
|
66 |
chain_type=chain_type,
|
@@ -76,6 +77,7 @@ def summarize_data(docs,llm_model,chain_type='refine'):
|
|
76 |
consice_sumary = re.search("CONCISE SUMMARY:.*\.*$", output_text).group(0)
|
77 |
dash_id = consice_sumary.find('-')
|
78 |
return consice_sumary[:dash_id].replace(' ','\n')
|
|
|
79 |
# matches = re.finditer(regex, output_text, re.DOTALL)
|
80 |
# for matchNum, match in enumerate(matches, start=1):
|
81 |
# for groupNum in range(0, len(match.groups())):
|
@@ -115,6 +117,15 @@ def document_loader(temperature,max_tokens,api_key,model_name,file_path):
|
|
115 |
was_truncated = conversion_stats["truncated"]
|
116 |
print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
if converted_txt:
|
119 |
print("Document Processed ..")
|
120 |
texts = process_documents(texts=converted_txt)
|
|
|
7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
8 |
from langchain.prompts import PromptTemplate
|
9 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
10 |
+
from read_photodocument import convert_PDF_to_Text,convert_image_to_pdf
|
11 |
from doctr.io import DocumentFile
|
12 |
from doctr.models import ocr_predictor
|
13 |
import contextlib
|
|
|
61 |
|
62 |
prompt = PromptTemplate.from_template(prompt_template)
|
63 |
refine_prompt = PromptTemplate.from_template(refine_template)
|
64 |
+
|
65 |
|
66 |
chain = load_summarize_chain(llm=llm_model,
|
67 |
chain_type=chain_type,
|
|
|
77 |
consice_sumary = re.search("CONCISE SUMMARY:.*\.*$", output_text).group(0)
|
78 |
dash_id = consice_sumary.find('-')
|
79 |
return consice_sumary[:dash_id].replace(' ','\n')
|
80 |
+
|
81 |
# matches = re.finditer(regex, output_text, re.DOTALL)
|
82 |
# for matchNum, match in enumerate(matches, start=1):
|
83 |
# for groupNum in range(0, len(match.groups())):
|
|
|
117 |
was_truncated = conversion_stats["truncated"]
|
118 |
print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
|
119 |
|
120 |
+
elif file_path.endswith('.jpg') or file_path.endswith('.jpeg'):
|
121 |
+
conversion_stats = convert_image_to_pdf(file_path,model)
|
122 |
+
converted_txt = conversion_stats["converted_text"]
|
123 |
+
num_pages = conversion_stats["num_pages"]
|
124 |
+
was_truncated = conversion_stats["truncated"]
|
125 |
+
print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
|
126 |
+
|
127 |
+
else:
|
128 |
+
return ("Invalid Format ....")
|
129 |
if converted_txt:
|
130 |
print("Document Processed ..")
|
131 |
texts = process_documents(texts=converted_txt)
|