Spaces:

Samarth991
/

Summarize-PhotoDocument

Sleeping

App Files Files Community

Summarize-PhotoDocument / app.py

Samarth991

adding app

e963fa4 over 1 year ago

raw

history blame

7.71 kB

	import os
	import gradio as gr
	import re
	from langchain.vectorstores import FAISS
	from langchain.embeddings.base import Embeddings
	from typing import List
	from sentence_transformers import SentenceTransformer
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain.prompts import PromptTemplate
	from langchain_community.llms.huggingface_hub import HuggingFaceHub
	from read_photodocument import convert_PDF_to_Text
	from doctr.io import DocumentFile
	from doctr.models import ocr_predictor
	import contextlib
	from langchain.schema import Document
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.chains.summarize import load_summarize_chain
	import logging

	# Root logger configuration: INFO level, "<timestamp> <level> <message>".
	# The timestamp uses the 24-hour clock (%H); the 12-hour %I without an
	# AM/PM marker made morning and afternoon entries indistinguishable.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s %(levelname)s %(message)s",
	    datefmt="%m/%d/%Y %H:%M:%S",
	)

	# Device on which the embedding model is loaded.
	DEVICE = 'cpu'
	# File extensions listed for upload.
	FILE_EXT = ['pdf','jpg','jpeg']
	DEFAULT_SYSTEM_PROMPT = "As an intelligent AI your task is to extract text from the pdf containing image and create a summary and higlight vital point within it ."

	# Generation limits and defaults; MAX_INPUT_TOKEN_LENGTH is the upper bound
	# of the "max new tokens" slider in the interface below.
	MAX_NEW_TOKENS = 2048
	DEFAULT_TEMPERATURE = 0.1
	DEFAULT_MAX_NEW_TOKENS = 1024
	MAX_INPUT_TOKEN_LENGTH = 2048

	# Alternative model: "sentence-transformers/all-MiniLM-l6-v2"
	embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'
	# Embedding model instantiated once at import time; embeddings are left
	# un-normalized.
	local_embeddings = HuggingFaceEmbeddings(
	    model_name=embedding_modelPath,
	    model_kwargs={'device': DEVICE},
	    encode_kwargs={'normalize_embeddings': False},
	)


	# Build the OCR predictor once at import time. Anything printed to stdout
	# while the predictor is being constructed is suppressed.
	with contextlib.redirect_stdout(None):
	    ocr_model = ocr_predictor(
	        det_arch="db_resnet50",
	        reco_arch="crnn_mobilenet_v3_large",
	        pretrained=True,
	        assume_straight_pages=True,
	    )

	def loading_file():
	    """Return the fixed status string ``"Loading..."``."""
	    status = "Loading..."
	    return status


	def summarize_data(docs,llm_model,chain_type='refine'):
	    """Summarize ``docs`` with a LangChain summarize chain.

	    Args:
	        docs: Documents passed to the chain as ``input_documents``.
	        llm_model: Language model handed to ``load_summarize_chain``.
	        chain_type: Chain type forwarded to ``load_summarize_chain``.

	    Returns:
	        A list of lines: the text after the first ``CONCISE SUMMARY:``
	        marker in the chain output, stripped and split on newlines. If the
	        marker is absent, the whole stripped output is split instead, so
	        the function always returns a list.
	    """
	    # NOTE(review): the two templates below are built but never passed to
	    # the chain (the matching keyword arguments are commented out), so the
	    # chain runs with its default prompts. The refine template also has no
	    # {text} placeholder -- confirm before wiring it in.
	    prompt_template = """
	Write a concise summary of the following pointwise avoid repetion:
	{text}
	CONCISE SUMMARY:
	"""
	    refine_template = (
	        "Your job is to produce a final summary in points.\n"
	        "Existing summary up to a certain point: {existing_answer}\n"
	        "write the details of summary pointwise and avoid repetion."
	    )

	    prompt = PromptTemplate.from_template(prompt_template)
	    refine_prompt = PromptTemplate.from_template(refine_template)

	    chain = load_summarize_chain(
	        llm=llm_model,
	        chain_type=chain_type,
	        # question_prompt=prompt,
	        # refine_prompt=,
	        return_intermediate_steps=False,
	        input_key="input_documents",
	        output_key="output_text",
	    )
	    summary = chain({"input_documents": docs}, return_only_outputs=True)
	    output_text = summary["output_text"].strip()

	    # Greedy DOTALL capture: everything after the first marker up to the
	    # end of the output. A single search is enough because that one match
	    # already consumes the rest of the text.
	    match = re.search(r"CONCISE SUMMARY:(.*)", output_text, re.DOTALL)
	    # Without the marker, fall back to the full output instead of leaving
	    # the result undefined.
	    summary_text = match.group(1).strip() if match else output_text
	    return summary_text.split("\n")


	def process_documents(texts,data_chunk=1000,chunk_overlap=10):
	    """Split ``texts`` into chunks and wrap each chunk in a ``Document``.

	    Args:
	        texts: The text to split.
	        data_chunk: Value passed to the splitter as ``chunk_size``.
	        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

	    Returns:
	        A list of ``Document`` objects, one per chunk, in splitter order.
	    """
	    # Split on newlines; chunk length is measured with ``len``.
	    splitter = CharacterTextSplitter(
	        separator="\n",
	        chunk_size=data_chunk,
	        chunk_overlap=chunk_overlap,
	        length_function=len,
	    )
	    return [Document(page_content=chunk) for chunk in splitter.split_text(texts)]

	def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct',temperature=0.01,max_tokens=4096,API_key=None):
	    """Create a ``HuggingFaceHub`` LLM for the given repository.

	    Args:
	        model_id: Passed to ``HuggingFaceHub`` as ``repo_id``.
	        temperature: Stored in ``model_kwargs`` as ``"temperature"``.
	        max_tokens: Stored in ``model_kwargs`` as ``"max_new_tokens"``.
	        API_key: Passed as ``huggingfacehub_api_token``.

	    Returns:
	        The constructed ``HuggingFaceHub`` instance.
	    """
	    generation_kwargs = {"temperature": temperature, "max_new_tokens": max_tokens}
	    return HuggingFaceHub(
	        repo_id=model_id,
	        huggingfacehub_api_token=API_key,
	        model_kwargs=generation_kwargs,
	    )


	def document_loader(temperature,max_tokens,api_key,model_name,file_path):
	    """Convert an uploaded PDF to text via OCR and return its summary.

	    Args:
	        temperature: Sampling temperature forwarded to the LLM.
	        max_tokens: Maximum number of new tokens forwarded to the LLM.
	        api_key: Hugging Face Hub API token.
	        model_name: Hub repository id of the LLM to use.
	        file_path: The uploaded file, either a path string or an object
	            exposing the path as ``.name``.

	    Returns:
	        The summary as newline-separated text, or the string
	        ``"Error in Processsing document "`` when the upload is missing,
	        is not a PDF, or yields no text/summary.
	    """
	    error_message = "Error in Processsing document "

	    # NOTE(review): depending on the Gradio version/configuration the
	    # "file" input presumably arrives as a path string or as a temp-file
	    # object with a ``.name`` attribute; accept both, and treat a missing
	    # upload (None) as an error instead of crashing.
	    path = getattr(file_path, "name", file_path)
	    if not isinstance(path, str) or not path.lower().endswith('.pdf'):
	        return error_message

	    # The model is built only once the upload is known to be a PDF.
	    model = get_hugging_face_model(model_id=model_name,API_key=api_key,temperature=temperature,max_tokens=max_tokens)

	    conversion_stats = convert_PDF_to_Text(document_file=path,ocr_model=ocr_model)
	    converted_txt = conversion_stats["converted_text"]
	    num_pages = conversion_stats["num_pages"]
	    was_truncated = conversion_stats["truncated"]
	    if not converted_txt:
	        return error_message

	    # Log sizes rather than dumping the whole converted text.
	    logging.info("Document processed: %s page(s), %s characters", num_pages, len(converted_txt))
	    if was_truncated:
	        logging.warning("PDF conversion reported truncated=%s", was_truncated)

	    # process_documents takes the text as ``texts``; passing it as
	    # ``documents=`` raised TypeError on every call.
	    docs = process_documents(texts=converted_txt)
	    lines = summarize_data(docs=docs,llm_model=model)
	    if not lines:
	        return error_message
	    # The interface output is a text box, so return one string instead of
	    # a list (which would be rendered as its Python repr).
	    return "\n".join(lines)



	# Gradio UI: the inputs are passed to document_loader positionally, in
	# order, as (temperature, max_tokens, api_key, model_name, file_path); its
	# return value is shown as text.
	iface = gr.Interface(
	    fn=document_loader,
	    inputs=[
	        gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 to 0.1"),
	        gr.Slider(512, MAX_INPUT_TOKEN_LENGTH, value=1024, step=512, label="max new tokens", info='Max new tokens'),
	        gr.Textbox(label="Add API key", type="password"),
	        # A default selection keeps model_name from arriving as None when
	        # the user does not touch the dropdown.
	        gr.Dropdown(
	            ['tiiuae/falcon-7b-instruct','mistralai/Mistral-7B-v0.1'],
	            value='tiiuae/falcon-7b-instruct',
	            label='Large Language Model',
	            info='LLM Service',
	        ),
	        "file",
	    ],
	    # Was misspelled "ouputs" and the preceding comma was missing, which
	    # made the file a SyntaxError.
	    outputs="text",
	    description="Summarize your PDF Document having Image • HuggingFace",
	)

	iface.launch()

	# with gr.Blocks(css=css) as demo:
	# with gr.Column(elem_id="col-container"):
	# gr.HTML(title)

	# with gr.Group():
	# chatbot = gr.Chatbot(height=300)
	# with gr.Row():
	# sumarize_btn = gr.Button(value="Summarize", variant="primary", scale = 1)
	# clean_chat_btn = gr.Button("Delete Chat")

	# with gr.Column():
	# LLM_option = gr.Dropdown(['tiiuae/falcon-7b-instruct','mistralai/Mistral-7B-v0.1'],label='Large Language Model Selection',info='LLM Service')

	# with gr.Column():
	# with gr.Box():
	# file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select type of file to upload !")
	# pdf_doc = gr.File(label="Upload File", file_types=FILE_EXT, type="file")
	# with gr.Accordion(label='Advanced options', open=False):
	# max_new_tokens = gr.Slider(
	# label='Max new tokens',
	# minimum=512,
	# maximum=MAX_NEW_TOKENS,
	# step=1024,
	# value=DEFAULT_MAX_NEW_TOKENS,
	# )
	# temperature = gr.Slider(
	# label='Temperature',
	# minimum=0.01,
	# maximum=1.0,
	# step=0.05,
	# value=DEFAULT_TEMPERATURE,
	# )
	# with gr.Row():
	# langchain_status = gr.Textbox(label="Status", placeholder="", interactive = False)
	# load_pdf = gr.Button("Upload File & Generate Embeddings",).style(full_width = False)

	# # chatbot = gr.Chatbot()
	# # question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter")
	# # submit_button = gr.Button("Send Message")

	# if pdf_doc:
	# load_pdf.click(loading_file, None, langchain_status, queue=False)
	# load_pdf.click(document_loader, inputs=[pdf_doc,file_extension,temperature,max_new_tokens], outputs=[langchain_status], queue=False)

	# #question.submit(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
	# #submit_btn.click(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
	# sumarize_btn.click()
	# # submit_btn.then(chatf.highlight_found_text, [chatbot, sources], [sources])
	# clean_chat_btn.click(clear_chat, [], chatbot)


	# demo.launch()