Achille Thin - Genesis committed on
Commit
4b75db9
·
1 Parent(s): 63c3d58

adding local data loading

Browse files
Files changed (1) hide show
  1. app.py +30 -13
app.py CHANGED
@@ -1,12 +1,13 @@
1
  import os
2
  import json
 
3
  import gradio as gr
4
  from llama_index import (
5
  VectorStoreIndex,
6
  download_loader,
7
  )
8
  import chromadb
9
-
10
  from llama_index.llms import MistralAI
11
  from llama_index.embeddings import MistralAIEmbedding
12
  from llama_index.vector_stores import ChromaVectorStore
@@ -21,7 +22,7 @@ placeholder = (
21
  placeholder_url = "Extract text from this url"
22
  llm_model = "mistral-small"
23
 
24
- env_api_key = os.environ.get("MISTRAL_API_KEY")
25
  query_engine = None
26
 
27
  # Define LLMs
@@ -52,7 +53,10 @@ def get_documents_in_db():
52
  print("Fetching documents in DB")
53
  docs = []
54
  for item in chroma_collection.get(include=["metadatas"])["metadatas"]:
55
- docs.append(json.loads(item["_node_content"])["metadata"]["file_name"])
 
 
 
56
  docs = list(set(docs))
57
  print(f"Found {len(docs)} documents")
58
  out = "**List of files in db:**\n"
@@ -81,17 +85,29 @@ def load_file(file):
81
  )
82
 
83
  def load_local_data(data_folder):
84
-
85
- ids = chroma_collection.get()["ids"]
86
- chroma_collection.delete(ids)
87
- print('Cleaning DB')
88
-
89
  for file in os.listdir(data_folder):
90
- print('Adding file ' + file + ' to DB')
91
- documents = loader.load_data(file= data_folder + file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- for doc in documents:
94
- index.insert(doc)
95
 
96
  def load_document(input_file):
97
  file_name = input_file.name.split("/")[-1]
@@ -124,7 +140,6 @@ with gr.Blocks() as demo:
124
  file_msg = gr.Textbox(
125
  label="Loaded documents:", container=False, visible=False
126
  )
127
-
128
  input_file.upload(
129
  fn=load_document,
130
  inputs=[
@@ -134,6 +149,8 @@ with gr.Blocks() as demo:
134
  concurrency_limit=20,
135
  )
136
 
 
 
137
  help_msg = gr.Markdown(
138
  value="Once the document is loaded, press the Encode button below to add it to the db."
139
  )
 
1
  import os
2
  import json
3
+ import pandas as pd
4
  import gradio as gr
5
  from llama_index import (
6
  VectorStoreIndex,
7
  download_loader,
8
  )
9
  import chromadb
10
+ from llama_index import Document
11
  from llama_index.llms import MistralAI
12
  from llama_index.embeddings import MistralAIEmbedding
13
  from llama_index.vector_stores import ChromaVectorStore
 
22
  placeholder_url = "Extract text from this url"
23
  llm_model = "mistral-small"
24
 
25
# Read the Mistral API key from the environment. A literal key must never be
# committed to the repository; any key that was committed should be revoked.
env_api_key = os.environ.get("MISTRAL_API_KEY")
26
  query_engine = None
27
 
28
  # Define LLMs
 
53
  print("Fetching documents in DB")
54
  docs = []
55
  for item in chroma_collection.get(include=["metadatas"])["metadatas"]:
56
+ try:
57
+ docs.append(json.loads(item["_node_content"])["metadata"]["file_name"])
58
+ except:
59
+ pass
60
  docs = list(set(docs))
61
  print(f"Found {len(docs)} documents")
62
  out = "**List of files in db:**\n"
 
85
  )
86
 
87
  def load_local_data(data_folder):
 
 
 
 
 
88
  for file in os.listdir(data_folder):
89
+ if file.endswith('.pdf'):
90
+ print('Adding file ' + file + ' to DB')
91
+ documents = loader.load_data(file= data_folder + file)
92
+ for doc in documents:
93
+ index.insert(doc)
94
+ if file.endswith('.txt'):
95
+ print('Adding file ' + file + ' to DB')
96
+ with open(data_folder + file, 'r') as f:
97
+ file_ = f.read()
98
+ index.insert(Document(text=file_))
99
+ if file=='price_by_crop.csv':
100
+ print('Adding file ' + file + ' to DB')
101
+ prices_text = 'The price of some agricultural data is given by this csv: It displays three scenario, a mean, an optimistic, and a pessimistic' + str(pd.read_csv(data_folder + file))
102
+ index.insert(Document(text=prices_text))
103
+ if file=='data_cout_production_grandes_cultures_2021_2025.xlsx':
104
+ production_costs = ""
105
+ for _, row in pd.read_excel(data_folder + file).iterrows():
106
+ if row['ANNEE']==2024:
107
+ production_costs += f"Le coût de production par tonne en moyenne pour {row['CULTURES']} était {row['MOYENNE']} euros par tonne avec un scénario moyen, {row['QUART INFERIEUR']} pour un scénario optimiste, et {row['QUART SUPERIEUR']} pour un scénario pessimiste. \n"
108
+ print('Adding file ' + file + ' to DB')
109
+ index.insert(Document(text=production_costs))
110
 
 
 
111
 
112
  def load_document(input_file):
113
  file_name = input_file.name.split("/")[-1]
 
140
  file_msg = gr.Textbox(
141
  label="Loaded documents:", container=False, visible=False
142
  )
 
143
  input_file.upload(
144
  fn=load_document,
145
  inputs=[
 
149
  concurrency_limit=20,
150
  )
151
 
152
+ load_local_data('data/')
153
+ load_local_data('data/pdf/')
154
  help_msg = gr.Markdown(
155
  value="Once the document is loaded, press the Encode button below to add it to the db."
156
  )