File size: 7,706 Bytes
e963fa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import os 
import gradio as gr
import re
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from read_photodocument import convert_PDF_to_Text
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import contextlib
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
import logging

# Root logging configuration, applied once when the module is imported:
# INFO and above, each record prefixed with a 12-hour month/day/year timestamp
# and the level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S",
    level=logging.INFO,
)

# Module-level settings.  Apart from MAX_INPUT_TOKEN_LENGTH (upper bound of the
# "max new tokens" slider), these are not referenced by the active code in this
# file; some are only used by the commented-out Blocks UI at the bottom.
DEVICE = "cpu"
FILE_EXT = ["pdf", "jpg", "jpeg"]
DEFAULT_SYSTEM_PROMPT = (
    "As an intelligent AI your task is to extract text from the pdf "
    "containing image and create a summary and higlight vital point within it ."
)

# Generation limits and defaults.
MAX_NEW_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 2048

# Embedding model instantiated at import time on the CPU, with embedding
# normalisation switched off.
# Alternative checkpoint previously noted here: "sentence-transformers/all-MiniLM-l6-v2"
embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'
local_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)


# Build the docTR OCR predictor once at import time.  stdout is redirected to
# None for the duration, so anything printed while the predictor is being
# constructed is discarded.
with contextlib.redirect_stdout(None):
    ocr_model = ocr_predictor(
        "db_resnet50",
        "crnn_mobilenet_v3_large",
        pretrained=True,
        assume_straight_pages=True,
    )

def loading_file():
    """Return the fixed status string ``"Loading..."``."""
    status_text = "Loading..."
    return status_text


def summarize_data(docs,llm_model,chain_type='refine'):
    """Summarize ``docs`` with a LangChain summarize chain and return the lines.

    The chain is built without custom prompts, so whatever prompts
    ``load_summarize_chain`` uses by default for ``chain_type`` apply.

    Args:
        docs: Documents handed to the chain under the ``input_documents`` key.
        llm_model: Language model passed to ``load_summarize_chain``.
        chain_type: Chain strategy forwarded to ``load_summarize_chain``.

    Returns:
        list[str]: The summary split on newlines.  If the model output contains
        the ``CONCISE SUMMARY:`` marker, only the text after its first
        occurrence is returned; otherwise the whole (stripped) output is used.
    """
    chain = load_summarize_chain(
        llm=llm_model,
        chain_type=chain_type,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    summary = chain({"input_documents": docs}, return_only_outputs=True)
    output_text = summary["output_text"].strip()

    # Keep only what follows the first marker when it is present.  Falling back
    # to the full output avoids the UnboundLocalError the marker-only code hit
    # whenever the model response did not include the marker.
    _, marker, tail = output_text.partition("CONCISE SUMMARY:")
    summary_text = tail.strip() if marker else output_text
    return summary_text.split("\n")


def process_documents(texts,data_chunk=1000,chunk_overlap=10):
    """Split a text string into chunks and wrap each chunk in a ``Document``.

    Args:
        texts: The text to split.
        data_chunk: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        list[Document]: One ``Document`` per chunk, in splitter order.
    """
    # Newline-separated splitting, with chunk length measured by ``len``.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=data_chunk,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return [Document(page_content=chunk) for chunk in splitter.split_text(texts)]

def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct',temperature=0.01,max_tokens=4096,API_key=None):
    """Create a ``HuggingFaceHub`` LLM wrapper for the given repository.

    Args:
        model_id: Passed as ``repo_id``.
        temperature: Stored under ``"temperature"`` in ``model_kwargs``.
        max_tokens: Stored under ``"max_new_tokens"`` in ``model_kwargs``.
        API_key: Passed as ``huggingfacehub_api_token``.

    Returns:
        The constructed ``HuggingFaceHub`` instance.
    """
    generation_kwargs = {"temperature": temperature, "max_new_tokens": max_tokens}
    return HuggingFaceHub(
        huggingfacehub_api_token=API_key,
        repo_id=model_id,
        model_kwargs=generation_kwargs,
    )


def document_loader(temperature,max_tokens,api_key,model_name,file_path):
    """Run OCR on an uploaded PDF and summarize the extracted text.

    Args:
        temperature: Forwarded to ``get_hugging_face_model`` as ``temperature``.
        max_tokens: Forwarded to ``get_hugging_face_model`` as ``max_tokens``.
        api_key: Forwarded to ``get_hugging_face_model`` as ``API_key``.
        model_name: Forwarded to ``get_hugging_face_model`` as ``model_id``.
        file_path: The upload: either a path string or an object exposing the
            path through a ``name`` attribute.  ``None`` is accepted and
            reported as an error.

    Returns:
        The list of summary lines from ``summarize_data`` on success, otherwise
        the string ``"Error in Processsing document "``.
    """
    error_message = "Error in Processsing document "

    # An upload may arrive as a plain path or as a file-like wrapper whose
    # ``name`` holds the path; normalise to a string before inspecting it.
    path = getattr(file_path, "name", file_path)
    if not isinstance(path, str) or not path.lower().endswith('.pdf'):
        logging.warning("Missing or unsupported upload: %r", path)
        return error_message

    conversion_stats = convert_PDF_to_Text(document_file=path, ocr_model=ocr_model)
    converted_txt = conversion_stats["converted_text"]
    num_pages = conversion_stats["num_pages"]
    was_truncated = conversion_stats["truncated"]

    logging.info(
        "Converted %s page(s) into %s characters",
        num_pages,
        len(converted_txt) if converted_txt else 0,
    )
    # The full text can be very large, so it is only emitted at DEBUG level.
    logging.debug("Converted text %s", converted_txt)
    if was_truncated:
        # NOTE(review): presumably the converter cut the text short - confirm
        # against convert_PDF_to_Text.
        logging.warning("convert_PDF_to_Text reported truncated output for %r", path)

    if not converted_txt:
        return error_message

    logging.info("Document Processed ..")
    # Build the remote model only once there is text to summarize.
    model = get_hugging_face_model(
        model_id=model_name,
        API_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    # process_documents names its text parameter ``texts``.
    docs = process_documents(texts=converted_txt)
    return summarize_data(docs=docs, llm_model=model)



# Gradio UI.  The input components are listed in the same order as
# document_loader's parameters: temperature, max_tokens, api_key, model_name,
# file_path.
iface = gr.Interface(
    fn=document_loader,
    inputs=[
        gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 to 0.1"),
        gr.Slider(512, MAX_INPUT_TOKEN_LENGTH, value=1024, step=512, label="max new tokens", info='Max new tokens'),
        gr.Textbox(label="Add API key", type="password"),
        gr.Dropdown(
            ['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'],
            # Preselect a model so model_name is never None on submit.
            value='tiiuae/falcon-7b-instruct',
            label='Large Language Model',
            info='LLM Service',
        ),
        "file",
    ],
    outputs="text",
    description="Summarize your PDF Document having Image • HuggingFace",
)

iface.launch()

# with gr.Blocks(css=css) as demo:
#     with gr.Column(elem_id="col-container"):
#         gr.HTML(title)
        
#     with gr.Group():
#         chatbot = gr.Chatbot(height=300)
#     with gr.Row():
#         sumarize_btn = gr.Button(value="Summarize", variant="primary", scale = 1)
#         clean_chat_btn =  gr.Button("Delete Chat")

#     with gr.Column():
#         LLM_option = gr.Dropdown(['tiiuae/falcon-7b-instruct','mistralai/Mistral-7B-v0.1'],label='Large Language Model Selection',info='LLM Service')
                 
#         with gr.Column():
#             with gr.Box():
#                 file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select type of file to upload !")
#                 pdf_doc = gr.File(label="Upload File", file_types=FILE_EXT, type="file")
#                 with gr.Accordion(label='Advanced options', open=False):
#                     max_new_tokens = gr.Slider(
#                         label='Max new tokens',
#                         minimum=512,
#                         maximum=MAX_NEW_TOKENS,
#                         step=1024,
#                         value=DEFAULT_MAX_NEW_TOKENS,
#                         )
#                     temperature = gr.Slider(
#                     label='Temperature',
#                     minimum=0.01,
#                     maximum=1.0,
#                     step=0.05,
#                     value=DEFAULT_TEMPERATURE,
#                     )
#                 with gr.Row():
#                     langchain_status = gr.Textbox(label="Status", placeholder="", interactive = False)
#                     load_pdf = gr.Button("Upload File & Generate Embeddings",).style(full_width = False)

#         # chatbot = gr.Chatbot()
#         # question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter")
#         # submit_button = gr.Button("Send Message")

#     if pdf_doc:
#         load_pdf.click(loading_file, None, langchain_status, queue=False)    
#         load_pdf.click(document_loader, inputs=[pdf_doc,file_extension,temperature,max_new_tokens], outputs=[langchain_status], queue=False)

#     #question.submit(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
#     #submit_btn.click(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
#     sumarize_btn.click()
#     # submit_btn.then(chatf.highlight_found_text, [chatbot, sources], [sources])
#     clean_chat_btn.click(clear_chat, [], chatbot)


# demo.launch()