allinaigc committed on
Commit
3cc6ac7
·
1 Parent(s): 053df90

Upload localKB_construct.py

Browse files
Files changed (1) hide show
  1. localKB_construct.py +18 -8
localKB_construct.py CHANGED
@@ -2,10 +2,12 @@
2
  1.更新了llama-index的库。对应的函数名和用法都有所改变。
3
  '''
4
 
 
5
  import openai
6
  import requests
7
  import csv
8
  from llama_index import PromptHelper
 
9
  from llama_index import LLMPredictor
10
  from llama_index import ServiceContext
11
  from langchain.chat_models import ChatOpenAI
@@ -18,7 +20,6 @@ import math
18
  import pandas as pd
19
  import numpy as np
20
  import PyPDF2
21
- # from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper #* working in the previous version.
22
 
23
  ##* in the latest version: GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex, try removing it from the end of your imports
24
  from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
@@ -28,7 +29,7 @@ from llama_index import download_loader
28
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
29
  import sys
30
  import os
31
-
32
 
33
  ## environment settings.
34
  os.environ["OPENAI_API_KEY"] = "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"
@@ -72,27 +73,36 @@ def construct_index(directory_path):
72
 
73
  ## 如果是PDF文件,那么需要用如下命令。注意与txt文件的区别。且需要from llama_index import download_loader。
74
  #NOTE: 这里可以问:give me an example of GPT-4 solving math problem. 会回答关于这个PDF中的内容,所以可以确认这个程序调用了in-context learning的功能。
75
- CJKPDFReader = download_loader("CJKPDFReader")
76
- loader = CJKPDFReader()
 
 
77
  # documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
 
 
 
78
  documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
 
79
  # index = GPTSimpleVectorIndex(
80
  # documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
81
  # )
82
 
83
  # index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context) ## original version, working.
 
84
  index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) #* the funciton renamed.
 
 
85
  # index.save_to_disk('/Users/yunshi/Downloads/txt_dir/index.json') ## in the latest version, this function is not working.
86
 
87
  return index, service_context
88
 
89
- def process_file():
90
  print('process_file starts')
91
- file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
92
  #! 第一次运行时需要开启这个function。如果index已测试通过,就不需要再运行了。记得上传PDF和JSON文件到云服务器上。
93
  index, service_context = construct_index(file_path)
94
  # index.storage_context.persist(persist_dir="/Users/yunshi/Downloads/txt_dir/") #* 存储到本地,为以后调用。
95
- index.storage_context.persist(persist_dir=f"./") #* 存储到本地,为以后调用。
96
  print(index)
97
 
98
- process_file()
 
2
  1.更新了llama-index的库。对应的函数名和用法都有所改变。
3
  '''
4
 
5
+ # import gradio as gr
6
  import openai
7
  import requests
8
  import csv
9
  from llama_index import PromptHelper
10
+ # from llama_index import GPTSimpleVectorIndex ## renamed in the latest version.
11
  from llama_index import LLMPredictor
12
  from llama_index import ServiceContext
13
  from langchain.chat_models import ChatOpenAI
 
20
  import pandas as pd
21
  import numpy as np
22
  import PyPDF2
 
23
 
24
  ##* in the latest version: GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex, try removing it from the end of your imports
25
  from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
 
29
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
30
  import sys
31
  import os
32
+ from rich import print
33
 
34
  ## environment settings.
35
  os.environ["OPENAI_API_KEY"] = "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"
 
73
 
74
  ## 如果是PDF文件,那么需要用如下命令。注意与txt文件的区别。且需要from llama_index import download_loader。
75
  #NOTE: 这里可以问:give me an example of GPT-4 solving math problem. 会回答关于这个PDF中的内容,所以可以确认这个程序调用了in-context learning的功能。
76
+ # CJKPDFReader = download_loader("CJKPDFReader") ## 最新的版本好像不行了,需要用下面的命令。
77
+ # loader = CJKPDFReader()
78
+ PDFReader = download_loader("PDFReader") # working。
79
+ loader = PDFReader()
80
  # documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
81
+ print('directory_path now:', directory_path)
82
+ # print('111')
83
+ # documents = loader.load_data(file="/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf") #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
84
  documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。
85
+ print('222')
86
  # index = GPTSimpleVectorIndex(
87
  # documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
88
  # )
89
 
90
  # index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context) ## original version, working.
91
+ # print('documents:', documents)
92
  index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) #* the funciton renamed.
93
+ print('333')
94
+
95
  # index.save_to_disk('/Users/yunshi/Downloads/txt_dir/index.json') ## in the latest version, this function is not working.
96
 
97
  return index, service_context
98
 
99
+ def process_file(file_path,username):
100
  print('process_file starts')
101
+ # file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
102
  #! 第一次运行时需要开启这个function。如果index已测试通过,就不需要再运行了。记得上传PDF和JSON文件到云服务器上。
103
  index, service_context = construct_index(file_path)
104
  # index.storage_context.persist(persist_dir="/Users/yunshi/Downloads/txt_dir/") #* 存储到本地,为以后调用。
105
+ index.storage_context.persist(persist_dir=f"./{username}/") #* 存储到本地,为以后调用。
106
  print(index)
107
 
108
+ # process_file(file_path)