Upload localKB_construct.py
Browse files- localKB_construct.py +101 -0
localKB_construct.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''
|
2 |
+
1.更新了llama-index的库。对应的函数名和用法都有所改变。
|
3 |
+
'''
|
4 |
+
|
5 |
+
# import gradio as gr
|
6 |
+
import openai
|
7 |
+
import requests
|
8 |
+
import csv
|
9 |
+
from llama_index import PromptHelper
|
10 |
+
# from llama_index import GPTSimpleVectorIndex ## renamed in the latest version.
|
11 |
+
from llama_index import LLMPredictor
|
12 |
+
from llama_index import ServiceContext
|
13 |
+
from langchain.chat_models import ChatOpenAI
|
14 |
+
from langchain import OpenAI
|
15 |
+
from fastapi import FastAPI #* 实现流式数据
|
16 |
+
from fastapi.responses import StreamingResponse #* 实现流式数据
|
17 |
+
import sys
|
18 |
+
import os
|
19 |
+
import torch
|
20 |
+
import math
|
21 |
+
import pandas as pd
|
22 |
+
import numpy as np
|
23 |
+
import PyPDF2
|
24 |
+
# from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper #* working in the previous version.
|
25 |
+
|
26 |
+
##* in the latest version: GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex, try removing it from the end of your imports
|
27 |
+
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
|
28 |
+
from llama_index import StorageContext, load_index_from_storage
|
29 |
+
from llama_index import ServiceContext
|
30 |
+
from llama_index import download_loader
|
31 |
+
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
32 |
+
import sys
|
33 |
+
import os
|
34 |
+
from rich import print
|
35 |
+
|
36 |
+
## Environment settings.
## SECURITY(review): the original hard-coded an OpenAI API key here in plain
## text — that key has been published and MUST be revoked. Read the key from
## the environment instead of embedding it in source.
openai.api_key = os.environ["OPENAI_API_KEY"]
|
39 |
+
# file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
|
40 |
+
# file_path = "/Users/yunshi/Downloads/txt_dir/2023年百人会电动论坛 纪要 20230401.pdf"
|
41 |
+
|
42 |
+
## 建立index或者的过程。
|
43 |
+
def construct_index(directory_path):
    """Build a llama-index vector index from a single PDF file.

    Despite the parameter name, ``directory_path`` points at the PDF *file*
    itself (CJKPDFReader loads one file; SimpleDirectoryReader would take a
    folder).

    Args:
        directory_path: Path to the PDF file to index.

    Returns:
        Tuple ``(index, service_context)`` where ``index`` is a
        GPTVectorStoreIndex over the document and ``service_context`` is the
        ServiceContext used to build it (reusable for later queries).
    """
    # Prompt-sizing parameters for the model's context window.
    max_input_size = 4096     # maximum prompt size in tokens
    num_outputs = 1000        # tokens reserved for the model's answer
    chunk_size_limit = 6000   # maximum tokens per document chunk

    # LLM used for response synthesis; streaming=True enables incremental
    # token output downstream.
    llm_predictor = LLMPredictor(
        llm=ChatOpenAI(
            temperature=0.7,
            model_name="gpt-3.5-turbo-16k",
            max_tokens=512,
            streaming=True,
        )
    )

    # Newer llama-index releases replaced the positional max_chunk_overlap
    # argument with chunk_overlap_ratio (a fraction, not a token count).
    prompt_helper = PromptHelper(
        max_input_size,
        num_outputs,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=chunk_size_limit,
    )
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        prompt_helper=prompt_helper,
    )

    # CJKPDFReader handles PDFs containing CJK (Chinese/Japanese/Korean)
    # text. NOTE: it takes the file path itself, unlike the txt workflow
    # where SimpleDirectoryReader takes a folder.
    CJKPDFReader = download_loader("CJKPDFReader")
    loader = CJKPDFReader()
    documents = loader.load_data(file=directory_path)

    # GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex in newer
    # llama-index versions; save_to_disk() was likewise replaced by the
    # StorageContext.persist() API used by callers.
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=service_context
    )

    return index, service_context
|
91 |
+
|
92 |
+
def process_file(file_path="/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"):
    """Build the index for *file_path* and persist it to the current directory.

    Only needs to run once per document: the persisted index (JSON files in
    ``./``) can be reloaded later with ``load_index_from_storage`` instead of
    re-embedding the PDF. Remember to upload both the PDF and the persisted
    index files when deploying to a server.

    Args:
        file_path: Path to the PDF to index. Defaults to the original
            hard-coded location for backward compatibility.
    """
    print('process_file starts')
    index, service_context = construct_index(file_path)
    # Persist locally so later runs can load the index from disk.
    index.storage_context.persist(persist_dir="./")
    print(index)
|
100 |
+
|
101 |
+
# Guard the script entry point so importing this module does not trigger a
# full (and costly) re-indexing run as a side effect.
if __name__ == "__main__":
    process_file()
|