allinaigc committed on
Commit
3a87981
·
1 Parent(s): d1fc4bf

Upload localKB_construct.py

Browse files
Files changed (1) hide show
  1. localKB_construct.py +101 -0
localKB_construct.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ 1.更新了llama-index的库。对应的函数名和用法都有所改变。
3
+ '''
4
+
5
+ # import gradio as gr
6
+ import openai
7
+ import requests
8
+ import csv
9
+ from llama_index import PromptHelper
10
+ # from llama_index import GPTSimpleVectorIndex ## renamed in the latest version.
11
+ from llama_index import LLMPredictor
12
+ from llama_index import ServiceContext
13
+ from langchain.chat_models import ChatOpenAI
14
+ from langchain import OpenAI
15
+ from fastapi import FastAPI #* 实现流式数据
16
+ from fastapi.responses import StreamingResponse #* 实现流式数据
17
+ import sys
18
+ import os
19
+ import torch
20
+ import math
21
+ import pandas as pd
22
+ import numpy as np
23
+ import PyPDF2
24
+ # from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper #* working in the previous version.
25
+
26
+ ##* in the latest version: GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex, try removing it from the end of your imports
27
+ from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
28
+ from llama_index import StorageContext, load_index_from_storage
29
+ from llama_index import ServiceContext
30
+ from llama_index import download_loader
31
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
32
+ import sys
33
+ import os
34
+ from rich import print
35
+
36
+ ## enironment settings.
37
+ os.environ["OPENAI_API_KEY"] = "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"
38
+ openai.api_key = "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"
39
+ # file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
40
+ # file_path = "/Users/yunshi/Downloads/txt_dir/2023年百人会电动论坛 纪要 20230401.pdf"
41
+
42
+ ## 建立index或者的过程。
43
+ def construct_index(directory_path):
44
+ # file_path = f"{directory_path}/uploaded_file.pdf"
45
+
46
+ file_path = directory_path
47
+
48
+ # set maximum input si771006
49
+ # max_input_size = 4096 #* working
50
+ max_input_size = 4096
51
+ # set number of output tokens
52
+ # num_outputs = 3000 #* working
53
+ num_outputs = 1000
54
+ # set maximum chunk overlap
55
+ max_chunk_overlap = -1000 #* working
56
+ # set chunk size limit
57
+ # chunk_size_limit = 600
58
+ chunk_size_limit = 6000 #* working
59
+
60
+ # ## add chunk_overlap_ratio according to github.
61
+ # chunk_overlap_ratio= 0.1
62
+
63
+
64
+ # define LLM
65
+ # llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.5, model_name="gpt-3.5-turbo", max_tokens=2000))
66
+ llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo-16k", max_tokens=512,streaming=True))
67
+
68
+ ## 好像work了,2023.09.22, 注意这里的写法有调整。
69
+ # prompt_helper = PromptHelper(max_input_s≈ize, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
70
+ prompt_helper = PromptHelper(max_input_size, num_outputs, chunk_overlap_ratio= 0.1, chunk_size_limit=chunk_size_limit)
71
+ service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
72
+
73
+ ## 如果是txt文件,那么需要用如下命令。注意与PDF文件的区别。
74
+ # documents = SimpleDirectoryReader(directory_path).load_data()
75
+
76
+ ## 如果是PDF文件,那么需要用如下命令。注意与txt文件的区别。切需要from llama_index import download_loader。
77
+ #NOTE: 这里可以问:give me an example of GPT-4 solving math problem. 会回答关于这个PDF中的内容,所以可以确认这个程序调用了in-context learning的功能。
78
+ CJKPDFReader = download_loader("CJKPDFReader")
79
+ loader = CJKPDFReader()
80
+ # documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
81
+ documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
82
+ # index = GPTSimpleVectorIndex(
83
+ # documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
84
+ # )
85
+
86
+ # index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context) ## oringinal version, working.
87
+ index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) #* the funciton renamed.
88
+ # index.save_to_disk('/Users/yunshi/Downloads/txt_dir/index.json') ## in the latest version, this function is not working.
89
+
90
+ return index, service_context
91
+
92
+ def process_file():
93
+ print('process_file starts')
94
+ file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
95
+ #! 第一次运行是需要开启这个function。如果测试通过index,因此不需要在运行了。记得上传PDF和JSON文件到云服务器上。
96
+ index, service_context = construct_index(file_path)
97
+ # index.storage_context.persist(persist_dir="/Users/yunshi/Downloads/txt_dir/") #* 存储到本地,为以后调用。
98
+ index.storage_context.persist(persist_dir=f"./") #* 存储到本地,为以后调用。
99
+ print(index)
100
+
101
+ process_file()