'''
1. Updated the llama-index library; the corresponding function names and usage have changed.
'''
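## Renames encountered after the library update (summarized here from the comments below;
## the exact names depend on the installed llama-index version):
##   GPTSimpleVectorIndex     -> GPTVectorStoreIndex
##   index.save_to_disk(...)  -> index.storage_context.persist(persist_dir=...)
##   re-loading an index      -> StorageContext.from_defaults(...) + load_index_from_storage(...)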

# import gradio as gr
import openai
import requests
import csv
from llama_index import PromptHelper
# from llama_index import GPTSimpleVectorIndex ## renamed in the latest version. 
from llama_index import LLMPredictor
from llama_index import ServiceContext
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI
# from fastapi import FastAPI #* for streaming responses
# from fastapi.responses import StreamingResponse #* for streaming responses
import sys
import os
import math
import pandas as pd
import numpy as np
import PyPDF2

##* In the latest version, GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex; import the new name instead.
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from llama_index import StorageContext, load_index_from_storage
from llama_index import ServiceContext
from llama_index import download_loader
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import sys
import os
from rich import print

## Environment settings.
os.environ["OPENAI_API_KEY"] = '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'
openai.api_key = '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'
# file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
# file_path = "/Users/yunshi/Downloads/txt_dir/2023年百人会电动论坛 纪要 20230401.pdf"

## Building the index.
def construct_index(directory_path):
    # file_path = f"{directory_path}/uploaded_file.pdf"

    file_path = directory_path

    # set maximum input size
    # max_input_size = 4096 #* working
    max_input_size = 4096
    # set number of output tokens
    # num_outputs = 3000 #* working
    num_outputs = 1000
    # set maximum chunk overlap (kept for reference; the newer PromptHelper below takes chunk_overlap_ratio instead)
    max_chunk_overlap = -1000 #* working
    # set chunk size limit
    # chunk_size_limit = 600
    chunk_size_limit = 6000 #* working

    # ## add chunk_overlap_ratio according to github.
    # chunk_overlap_ratio= 0.1


    # define LLM
    # llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.5, model_name="gpt-3.5-turbo", max_tokens=2000))
    llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo-16k", max_tokens=512,streaming=True))
    
    ## Seems to work as of 2023-09-22; note that the PromptHelper call signature has changed here.
    # prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
    prompt_helper = PromptHelper(max_input_size, num_outputs, chunk_overlap_ratio= 0.1, chunk_size_limit=chunk_size_limit)
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

    ## For a txt file, use the following command (note the difference from the PDF case).
    # documents = SimpleDirectoryReader(directory_path).load_data() 
    
    ## For a PDF file, use the following commands (note the difference from the txt case); this also requires `from llama_index import download_loader`.
    #NOTE: You can ask e.g. "give me an example of GPT-4 solving a math problem"; the answer draws on the content of this PDF, which confirms that the program is using in-context learning.
    # CJKPDFReader = download_loader("CJKPDFReader") ## no longer seems to work in the latest version; use the loader below instead.
    # loader = CJKPDFReader()
    PDFReader = download_loader("PDFReader") # working.
    loader = PDFReader()
    # documents = loader.load_data(file=directory_path) #! Note: this points to the file itself, unlike the txt case, which points to a directory.
    print('directory_path now:', directory_path)
    # print('111')
    # documents = loader.load_data(file="/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf") #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
    documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
    print('222')
    # index = GPTSimpleVectorIndex(
    #     documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
    # )

    # index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context) ## original version, working.
    # print('documents:', documents)
    index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) #* the function was renamed.
    print('333')

    # index.save_to_disk('/Users/yunshi/Downloads/txt_dir/index.json') ## in the latest version, this function is not working.

    return index, service_context

def process_file(file_path,username):
    print('process_file starts')
    # file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
    #! This function must run on the first pass; once the index has been built and verified, it does not need to run again. Remember to upload the PDF and JSON files to the cloud server.
    index, service_context = construct_index(file_path)
    # index.storage_context.persist(persist_dir="/Users/yunshi/Downloads/txt_dir/") #* persist locally for later reuse.
    index.storage_context.persist(persist_dir=f"./{username}/") #* persist locally for later reuse.
    print(index)

# process_file(file_path)
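
## A minimal sketch of reading the persisted index back and querying it, assuming the installed
## llama_index version still exposes StorageContext / load_index_from_storage (imported above)
## and index.as_query_engine(). The function name load_saved_index, the persist_dir value and
## the sample question are illustrative only.
def load_saved_index(persist_dir):
    # Rebuild the index from the files written by index.storage_context.persist(...),
    # so the PDF does not need to be re-parsed or re-embedded.
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    index = load_index_from_storage(storage_context)
    return index

# index = load_saved_index(persist_dir="./some_username/")
# query_engine = index.as_query_engine()
# print(query_engine.query("Give me an example of GPT-4 solving a math problem."))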