import gradio as gr
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download
import faiss
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
from pypdf import PdfReader
from gradio_pdf import PDF
from transformers import pipeline
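
# Stack overview: gradio builds the web UI, faiss + HuggingFaceEmbeddings handle dense
# retrieval, pypdf extracts the document text, and transformers runs the generator model.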


title = "Mistral-7B-Instruct-GGUF Run On CPU-Basic Free Hardware"

description = """
🔎 [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) , 4-bit quantization balanced quality gguf version, running on CPU. English Only (Also support other languages but the quality's not good). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp) [GitHub - gpt4all](https://github.com/nomic-ai/gpt4all). 
🔨 Running on CPU-Basic free hardware. Suggest duplicating this space to run without a queue. 
Mistral does not support system prompt symbol (such as ```<<SYS>>```) now, input your system prompt in the first message if you need. Learn more: [Guardrailing Mistral 7B](https://docs.mistral.ai/usage/guardrailing). 
"""

"""
[Model From TheBloke/Mistral-7B-Instruct-v0.1-GGUF](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF)
[Mistral-instruct-v0.1 System prompt](https://docs.mistral.ai/usage/guardrailing)
"""
"""
model_path = "models"
model_name = "SmolLM-1.7B-Instruct.Q2_K.gguf"

hf_hub_download(repo_id="mradermacher/SmolLM-1.7B-Instruct-GGUF", filename=model_name, local_dir=model_path, local_dir_use_symlinks=False)
"""


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
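
# Generator model: CroissantLLMBase, a bilingual French/English base model, loaded below
# in fp16; device_map="auto" relies on the accelerate package being installed.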


model_name = "croissantllm/CroissantLLMBase"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

print("Start the model init process")
"""model = model = GPT4All(model_name, model_path, allow_download = False, device="cpu")


model.config["promptTemplate"] = "[INST] {0} [/INST]"
model.config["systemPrompt"] = "Tu es un assitant et tu dois répondre en français"
model._is_chat_session_activated = False

max_new_tokens = 2048"""

model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
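# Note: no model_name is passed, so HuggingFaceEmbeddings falls back to its default
# sentence-transformers model; it must match the model used to build the FAISS index below.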
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)






print("Finish the model init process")

def get_text_embedding(text):

    return embeddings.embed_query(text)


# FAISS index
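# Retrieval assets are pulled from this Space's own repo: a precomputed FAISS index
# of the NGAP document embeddings and the source PDF it was built from.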
index_path = hf_hub_download(repo_id="xavierbarbier/rag_ngap", filename="resource/embeddings_ngap.faiss", repo_type="space")

index = faiss.read_index(index_path)

# Chunks
pdf_path = hf_hub_download(repo_id="xavierbarbier/rag_ngap", filename="resource/NGAP 01042024.pdf", repo_type="space")

# creating a PDF reader object
reader = PdfReader(pdf_path)

text = []
for page in reader.pages:
    # extracting text from page
    text.append(page.extract_text())

text = ' '.join(text)

# Fixed-size character chunks; FAISS ids are mapped back into this list,
# so the chunking must match the one used when the index was built.
chunk_size = 2048
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
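
# RAG query flow: embed the question, retrieve the nearest chunk from the FAISS
# index, build a context-grounded prompt, and generate the answer with CroissantLLM.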

def qa(question):

    # Embed the question; FAISS expects a 2-D float32 array
    question_embeddings = np.array([get_text_embedding(question)], dtype=np.float32)

    D, I = index.search(question_embeddings, k=1)  # distances, indices
    retrieved_chunk = [chunks[i] for i in I.tolist()[0]]

    prompt = f"""
        Context information is below.
        ---------------------
        {retrieved_chunk}
        ---------------------
        Given the context information and not prior knowledge, answer the query.
        Query: {question}
        Answer:
       """

    # Disabled alternative: generation through the GPT4All backend.
    """
    max_new_tokens = 2048
    outputs = model.generate(prompt=prompt, temp=0.5, top_k=40, top_p=1, max_tokens=max_new_tokens)
    """

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # max_new_tokens caps the generated continuation (max_length would also count the prompt)
    tokens = model.generate(**inputs, max_new_tokens=1000, do_sample=True, top_p=0.95, top_k=60, temperature=0.3)

    # Decode only the newly generated tokens, skipping the echoed prompt
    new_tokens = tokens[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
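
# Minimal Gradio UI: a question box, a button wired to qa(), and an output box for the answer.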


with gr.Blocks() as demo:

    question_input = gr.Textbox(label="Question")
    qa_button = gr.Button("Ask")

    answer_output = gr.Textbox(label="Answer")

    qa_button.click(qa, question_input, answer_output)
    
    

if __name__ == "__main__":
    demo.queue(max_size=3).launch()