File size: 5,288 Bytes
e963fa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8cec98
e963fa4
 
 
 
 
 
 
 
 
 
 
 
 
 
ec20d6c
e963fa4
 
 
 
 
 
951cf22
fe84c5e
882ceed
fe84c5e
951cf22
fe84c5e
 
 
 
 
c95667e
e963fa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6bcd19
e963fa4
 
 
 
 
 
 
9f75b50
e963fa4
c95667e
e963fa4
 
 
 
 
 
 
2c58ef3
 
 
 
 
 
 
eef8834
fe84c5e
e963fa4
 
9f75b50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os 
import gradio as gr
import re
from langchain.embeddings.base import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from read_photodocument import convert_PDF_to_Text
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import contextlib
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
import logging

# Root logger: INFO and above, formatted as "<timestamp> <level> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S",
    level=logging.INFO,
)

# Module-wide settings. NOTE(review): none of the constants below except
# MAX_INPUT_TOKEN_LENGTH are referenced in the visible part of this file.
DEVICE = "cpu"
FILE_EXT = ["pdf", "jpg", "jpeg"]
DEFAULT_SYSTEM_PROMPT = (
    "As an intelligent AI your task is to extract text from the pdf containing image "
    "and create a summary and higlight vital point within it ."
)

# Token / sampling limits. MAX_INPUT_TOKEN_LENGTH is the upper bound of the
# "max new tokens" slider in the Gradio interface.
MAX_NEW_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 2048

# Sentence-embedding model name (alternative: "sentence-transformers/all-MiniLM-l6-v2").
embedding_modelPath = "multi-qa-mpnet-base-dot-v1"

# Embeddings run on CPU and are left un-normalized.
local_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False},
)


# Build the OCR predictor once at import time. sys.stdout is redirected to
# None for the duration of the construction so anything it prints is dropped.
with contextlib.redirect_stdout(None):
    ocr_model = ocr_predictor(
        "db_resnet50",
        "crnn_mobilenet_v3_large",
        pretrained=True,
        assume_straight_pages=True,
    )

def loading_file():
    """Return the fixed placeholder string ``"Loading..."``."""
    placeholder = "Loading..."
    return placeholder


def _extract_concise_summary(raw_output):
    """Trim raw chain output down to the text that is returned to the caller.

    Newlines are flattened to spaces, everything before the first
    "CONCISE SUMMARY:" marker is dropped (the marker itself is kept), the text
    is cut at the first '-' if there is one, and double spaces are turned back
    into line breaks.
    """
    marker = "CONCISE SUMMARY:"
    flattened = raw_output.replace('\n', ' ')

    # If the model did not echo the marker, keep the whole output instead of
    # failing on a missing match.
    marker_at = flattened.find(marker)
    concise = flattened[marker_at:] if marker_at != -1 else flattened

    # Only truncate when a dash is actually present; slicing with the -1 that
    # str.find returns on a miss would silently drop the final character.
    # NOTE(review): cutting at the first '-' also truncates at hyphenated
    # words -- confirm this heuristic is intended.
    dash_at = concise.find('-')
    if dash_at != -1:
        concise = concise[:dash_at]

    return concise.replace('  ', '\n')


def summarize_data(docs,llm_model,chain_type='refine'):
    """Summarize ``docs`` with a LangChain summarize chain.

    The chain is created with ``load_summarize_chain`` using the chain's own
    default prompts, then its ``output_text`` is post-processed by
    ``_extract_concise_summary``.

    Args:
        docs: Documents to summarize; passed to the chain as ``input_documents``.
        llm_model: Language model handed to ``load_summarize_chain``.
        chain_type: Summarize-chain variant to load (default ``'refine'``).

    Returns:
        The trimmed summary string, or ``""`` when ``docs`` is empty.
    """
    if not docs:
        # Nothing to summarize; do not invoke the chain on an empty input.
        return ""

    chain = load_summarize_chain(
        llm=llm_model,
        chain_type=chain_type,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    summary = chain({"input_documents": docs}, return_only_outputs=True)
    return _extract_concise_summary(summary["output_text"])
    


def process_documents(texts, data_chunk=1000, chunk_overlap=10):
    """Split raw text into ``Document`` chunks.

    Args:
        texts: The full text to split.
        data_chunk: ``chunk_size`` passed to the splitter (measured with ``len``).
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        A list with one ``Document`` per chunk produced by the splitter.
    """
    # Split on newlines and measure chunk length with len().
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=data_chunk,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return [Document(page_content=chunk) for chunk in splitter.split_text(texts)]

def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct', temperature=0.01, max_tokens=4096, API_key=None):
    """Create a ``HuggingFaceHub`` LLM for the given repository.

    Args:
        model_id: Hub repository id, passed as ``repo_id``.
        temperature: Forwarded in ``model_kwargs`` as ``"temperature"``.
        max_tokens: Forwarded in ``model_kwargs`` as ``"max_new_tokens"``.
        API_key: Passed as ``huggingfacehub_api_token``.

    Returns:
        The constructed ``HuggingFaceHub`` instance.
    """
    generation_settings = dict(temperature=temperature, max_new_tokens=max_tokens)
    return HuggingFaceHub(
        huggingfacehub_api_token=API_key,
        repo_id=model_id,
        model_kwargs=generation_settings,
    )


def document_loader(temperature,max_tokens,api_key,model_name,file_path):
    """Gradio callback: OCR an uploaded PDF and return its summary.

    Args:
        temperature: Sampling temperature forwarded to the LLM.
        max_tokens: Maximum new tokens forwarded to the LLM.
        api_key: Hugging Face API token.
        model_name: Hub repository id of the LLM to use.
        file_path: The uploaded file. Accepts a path string, an
            ``os.PathLike``, or an object exposing the path as ``.name``
            (what a Gradio file input may pass, depending on the Gradio
            version). ``None`` means no file was uploaded.

    Returns:
        The summary text, or ``"Error in Processsing document "`` when the
        input is missing, is not a PDF, or yields no text.
    """
    failure_message = "Error in Processsing document "

    # Normalise the upload to a plain path string before touching it.
    if isinstance(file_path, (str, os.PathLike)):
        source_path = os.fspath(file_path)
    else:
        source_path = getattr(file_path, "name", None)

    # Reject missing / non-PDF input up front (extension match is
    # case-insensitive) so no model or OCR work is done for it.
    if not isinstance(source_path, str) or not source_path.lower().endswith('.pdf'):
        return failure_message

    # Build the LLM client before the OCR pass so that a problem raised while
    # constructing it surfaces before the expensive conversion runs.
    model = get_hugging_face_model(model_id=model_name,API_key=api_key,temperature=temperature,max_tokens=max_tokens)

    conversion_stats = convert_PDF_to_Text(PDF_file=source_path,ocr_model=ocr_model)
    converted_txt = conversion_stats["converted_text"]
    num_pages = conversion_stats["num_pages"]
    print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))

    if not converted_txt:
        return failure_message

    print("Document Processed ..")
    texts = process_documents(texts=converted_txt)
    return summarize_data(docs=texts,llm_model=model)



# Gradio UI. The input widgets are listed in the same order as the positional
# parameters of document_loader: temperature, max_tokens, api_key, model_name,
# file_path.
iface = gr.Interface(
    fn=document_loader,
    inputs=[
        gr.Slider(
            0.01,
            0.1,
            value=0.01,
            step=0.01,
            label="temperature",
            info="Choose between 0.01 to 0.1",
        ),
        gr.Slider(
            512,
            MAX_INPUT_TOKEN_LENGTH,
            value=1024,
            step=512,
            label="max new tokens",
            info='Max new tokens',
        ),
        gr.Textbox(label="Add API key", type="password"),
        gr.Dropdown(
            ['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'],
            label='Large Language Model',
            info='LLM Service',
        ),
        "file",
    ],
    outputs="text",
    description="Summarize your PDF Document having Image  • HuggingFace",
)

# Start the web app as soon as this module is executed.
iface.launch()