Spaces:
Runtime error
Runtime error
Achille Thin - Genesis
committed on
Commit
·
4b75db9
1
Parent(s):
63c3d58
adding local data loading
Browse files
app.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
import os
|
2 |
import json
|
|
|
3 |
import gradio as gr
|
4 |
from llama_index import (
|
5 |
VectorStoreIndex,
|
6 |
download_loader,
|
7 |
)
|
8 |
import chromadb
|
9 |
-
|
10 |
from llama_index.llms import MistralAI
|
11 |
from llama_index.embeddings import MistralAIEmbedding
|
12 |
from llama_index.vector_stores import ChromaVectorStore
|
@@ -21,7 +22,7 @@ placeholder = (
|
|
21 |
placeholder_url = "Extract text from this url"
|
22 |
llm_model = "mistral-small"
|
23 |
|
24 |
-
env_api_key = os.environ.get("MISTRAL_API_KEY")
|
25 |
query_engine = None
|
26 |
|
27 |
# Define LLMs
|
@@ -52,7 +53,10 @@ def get_documents_in_db():
|
|
52 |
print("Fetching documents in DB")
|
53 |
docs = []
|
54 |
for item in chroma_collection.get(include=["metadatas"])["metadatas"]:
|
55 |
-
|
|
|
|
|
|
|
56 |
docs = list(set(docs))
|
57 |
print(f"Found {len(docs)} documents")
|
58 |
out = "**List of files in db:**\n"
|
@@ -81,17 +85,29 @@ def load_file(file):
|
|
81 |
)
|
82 |
|
83 |
def load_local_data(data_folder):
|
84 |
-
|
85 |
-
ids = chroma_collection.get()["ids"]
|
86 |
-
chroma_collection.delete(ids)
|
87 |
-
print('Cleaning DB')
|
88 |
-
|
89 |
for file in os.listdir(data_folder):
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
for doc in documents:
|
94 |
-
index.insert(doc)
|
95 |
|
96 |
def load_document(input_file):
|
97 |
file_name = input_file.name.split("/")[-1]
|
@@ -124,7 +140,6 @@ with gr.Blocks() as demo:
|
|
124 |
file_msg = gr.Textbox(
|
125 |
label="Loaded documents:", container=False, visible=False
|
126 |
)
|
127 |
-
|
128 |
input_file.upload(
|
129 |
fn=load_document,
|
130 |
inputs=[
|
@@ -134,6 +149,8 @@ with gr.Blocks() as demo:
|
|
134 |
concurrency_limit=20,
|
135 |
)
|
136 |
|
|
|
|
|
137 |
help_msg = gr.Markdown(
|
138 |
value="Once the document is loaded, press the Encode button below to add it to the db."
|
139 |
)
|
|
|
1 |
import os
|
2 |
import json
|
3 |
+
import pandas as pd
|
4 |
import gradio as gr
|
5 |
from llama_index import (
|
6 |
VectorStoreIndex,
|
7 |
download_loader,
|
8 |
)
|
9 |
import chromadb
|
10 |
+
from llama_index import Document
|
11 |
from llama_index.llms import MistralAI
|
12 |
from llama_index.embeddings import MistralAIEmbedding
|
13 |
from llama_index.vector_stores import ChromaVectorStore
|
|
|
22 |
placeholder_url = "Extract text from this url"
|
23 |
llm_model = "mistral-small"
|
24 |
|
25 |
+
# Read the Mistral API key from the environment instead of embedding it in
# source: a literal key committed to the repository is readable by anyone who
# can see the commit.
# NOTE(review): the key that was hard-coded on this line has been published
# with this commit and should be revoked and rotated.
env_api_key = os.environ.get("MISTRAL_API_KEY")
|
26 |
query_engine = None
|
27 |
|
28 |
# Define LLMs
|
|
|
53 |
print("Fetching documents in DB")
|
54 |
docs = []
|
55 |
for item in chroma_collection.get(include=["metadatas"])["metadatas"]:
|
56 |
+
try:
|
57 |
+
docs.append(json.loads(item["_node_content"])["metadata"]["file_name"])
|
58 |
+
except:
|
59 |
+
pass
|
60 |
docs = list(set(docs))
|
61 |
print(f"Found {len(docs)} documents")
|
62 |
out = "**List of files in db:**\n"
|
|
|
85 |
)
|
86 |
|
87 |
def load_local_data(data_folder):
    """Insert the supported files found directly in *data_folder* into the index.

    Each entry of ``os.listdir(data_folder)`` is handled according to its name:

    * ``*.pdf``: parsed with the module-level ``loader`` and every resulting
      document is inserted.
    * ``*.txt``: read as UTF-8 text and inserted as a single ``Document``.
    * ``price_by_crop.csv``: the full table is rendered as text, prefixed
      with a short English description, and inserted.
    * ``data_cout_production_grandes_cultures_2021_2025.xlsx``: one French
      sentence per row whose ``ANNEE`` is 2024 is generated and the sentences
      are inserted together as a single ``Document``.

    Any other entry (including sub-directories) is ignored.

    Args:
        data_folder: Directory to scan. A trailing slash is optional.

    Returns:
        None. The function works by calling ``index.insert``.

    NOTE(review): ``loader`` and ``index`` are not defined in this block;
    presumably they are module-level objects created earlier in app.py.
    """
    for file in os.listdir(data_folder):
        # os.path.join works with or without a trailing separator, unlike
        # plain string concatenation.
        path = os.path.join(data_folder, file)

        if file.endswith('.pdf'):
            print('Adding file ' + file + ' to DB')
            for doc in loader.load_data(file=path):
                index.insert(doc)

        elif file.endswith('.txt'):
            print('Adding file ' + file + ' to DB')
            # Explicit encoding so the result does not depend on the host locale.
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            index.insert(Document(text=text))

        elif file == 'price_by_crop.csv':
            print('Adding file ' + file + ' to DB')
            # to_string() renders every row and column; str(DataFrame) elides
            # the middle of larger tables with "...", which would index
            # truncated price data.
            prices_text = 'The price of some agricultural data is given by this csv: It displays three scenario, a mean, an optimistic, and a pessimistic' + pd.read_csv(path).to_string()
            index.insert(Document(text=prices_text))

        elif file == 'data_cout_production_grandes_cultures_2021_2025.xlsx':
            # Only the 2024 rows are turned into sentences.
            production_costs = "".join(
                f"Le coût de production par tonne en moyenne pour {row['CULTURES']} était {row['MOYENNE']} euros par tonne avec un scénario moyen, {row['QUART INFERIEUR']} pour un scénario optimiste, et {row['QUART SUPERIEUR']} pour un scénario pessimiste. \n"
                for _, row in pd.read_excel(path).iterrows()
                if row['ANNEE'] == 2024
            )
            # Skip the insert when no 2024 row exists rather than indexing an
            # empty document.
            if production_costs:
                print('Adding file ' + file + ' to DB')
                index.insert(Document(text=production_costs))
|
110 |
|
|
|
|
|
111 |
|
112 |
def load_document(input_file):
|
113 |
file_name = input_file.name.split("/")[-1]
|
|
|
140 |
file_msg = gr.Textbox(
|
141 |
label="Loaded documents:", container=False, visible=False
|
142 |
)
|
|
|
143 |
input_file.upload(
|
144 |
fn=load_document,
|
145 |
inputs=[
|
|
|
149 |
concurrency_limit=20,
|
150 |
)
|
151 |
|
152 |
+
load_local_data('data/')
|
153 |
+
load_local_data('data/pdf/')
|
154 |
help_msg = gr.Markdown(
|
155 |
value="Once the document is loaded, press the Encode button below to add it to the db."
|
156 |
)
|