import os
from json import loads

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from llama_index import (Document, GPTVectorStoreIndex, LLMPredictor,
                         PromptHelper, ServiceContext, StorageContext,
                         load_index_from_storage)

from utils.customLLM import CustomLLM

load_dotenv()

# get model (uncomment together with the pipeline below to use a local HF model)
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# define prompt helper
# set maximum input size
context_window = 2048
# set number of output tokens
num_output = 525
# set maximum chunk overlap
chunk_overlap_ratio = 0.2
prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)

# create a pipeline
# pl = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # device=0,  # GPU device number
#     # max_length=512,
#     do_sample=True,
#     top_p=0.95,
#     top_k=50,
#     temperature=0.7,
# )

# define llm
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper
)


def prepare_data(file_path: str):
    """Load the source JSON and convert each record into a Document."""
    df = pd.read_json(file_path)
    # drop rows with empty-string values
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)
    parsed = loads(df.to_json(orient="records"))

    documents = []
    for item in parsed:
        document = Document(
            item["paragraphText"],
            item["_id"]["$oid"],
            extra_info={
                "chapter": item["chapter"],
                "article": item["article"],
                "title": item["title"],
            },
        )
        documents.append(document)
    return documents


def initialize_index(index_name):
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # rebuild storage context from the persisted directory
        storage_context = StorageContext.from_defaults(persist_dir=file_path)
        # load the index, passing the same service_context so queries
        # go through the custom LLM rather than the default
        index = load_index_from_storage(
            storage_context, service_context=service_context
        )
        return index
    else:
        documents = prepare_data(r"./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(
            documents, service_context=service_context
        )
        index.storage_context.persist(file_path)
        return index
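

# A minimal usage sketch. The index name "regulations" and the question text
# are placeholders, not part of the original module; the query-engine calls
# assume the llama_index 0.6.x-style interface imported above.
if __name__ == "__main__":
    index = initialize_index("regulations")
    # build a query engine over the vector index and run a sample query
    query_engine = index.as_query_engine()
    response = query_engine.query("Which article defines the key terms?")
    print(response)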