pvanand committed on
Commit
43c94a5
1 Parent(s): ca0425f

add csv data indexing

Browse files
Files changed (1) hide show
  1. main.py +37 -0
main.py CHANGED
@@ -6,6 +6,8 @@ import json
6
  import os
7
  import logging
8
  from txtai.embeddings import Embeddings
 
 
9
 
10
  # Set up logging
11
  logging.basicConfig(level=logging.INFO)
@@ -105,6 +107,41 @@ async def query_index(request: QueryRequest):
105
  logger.error(f"Error querying index: {str(e)}")
106
  raise HTTPException(status_code=500, detail=f"Error querying index: {str(e)}")
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  if __name__ == "__main__":
109
  import uvicorn
110
  uvicorn.run(app, host="0.0.0.0", port=8000)
 
6
  import os
7
  import logging
8
  from txtai.embeddings import Embeddings
9
+ import pandas as pd
10
+ import glob
11
 
12
  # Set up logging
13
  logging.basicConfig(level=logging.INFO)
 
107
  logger.error(f"Error querying index: {str(e)}")
108
  raise HTTPException(status_code=500, detail=f"Error querying index: {str(e)}")
109
 
110
def process_csv_file(file_path):
    """Read a CSV file and turn every row into one space-joined text document.

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        A ``(txtai_data, documents)`` pair. ``documents`` holds one string per
        CSV row (the row's cells converted with ``str`` and joined by single
        spaces); ``txtai_data`` holds the matching
        ``(row_position, text, None)`` tuples. ``(None, None)`` is returned
        when the file cannot be processed or has no data rows.
    """
    try:
        df = pd.read_csv(file_path)

        # A header-only file has nothing to index; report it explicitly
        # instead of failing later with an unrelated error.
        if df.empty:
            logger.warning(f"No data rows found in CSV file {file_path}")
            return None, None

        # itertuples keeps each column's own dtype. Building a Series per row
        # (apply/iterrows) coerces the cells to one common dtype, which turned
        # an integer cell next to a float column into "1.0".
        documents = [
            ' '.join(str(cell) for cell in row)
            for row in df.itertuples(index=False, name=None)
        ]
        txtai_data = [(i, text, None) for i, text in enumerate(documents)]
        return txtai_data, documents
    except Exception as e:
        logger.error(f"Error processing CSV file {file_path}: {str(e)}")
        return None, None
119
+
120
def check_and_index_csv_files():
    """Index every CSV in /app/index_data that has no entry under /app/indexes.

    For each ``*.csv`` file, the file name without extension is used as the
    index id. Files whose ``/app/indexes/<index id>`` path already exists are
    skipped. The remaining files are parsed with ``process_csv_file``, fed to
    the module-level ``embeddings`` object and handed to ``save_embeddings``.
    Returns ``None``; progress and problems are reported through ``logger``.
    """
    index_data_folder = "/app/index_data"
    if not os.path.exists(index_data_folder):
        logger.warning(f"index_data folder not found: {index_data_folder}")
        return

    pattern = os.path.join(index_data_folder, "*.csv")
    for csv_file in glob.glob(pattern):
        file_name = os.path.basename(csv_file)
        index_id = os.path.splitext(file_name)[0]

        # Nothing to do when an index with this id is already on disk.
        if os.path.exists(f"/app/indexes/{index_id}"):
            logger.info(f"Index already exists for: {csv_file}")
            continue

        logger.info(f"Processing CSV file: {csv_file}")
        txtai_data, documents = process_csv_file(csv_file)

        # process_csv_file signals failure with (None, None); empty results
        # are treated the same way.
        if not (txtai_data and documents):
            logger.warning(f"Failed to process CSV file: {csv_file}")
            continue

        embeddings.index(txtai_data)
        save_embeddings(index_id, documents)
        logger.info(f"CSV file indexed successfully: {csv_file}")
140
+
141
@app.on_event("startup")
async def startup_event():
    """Startup hook: run the CSV indexing pass when the application starts."""
    # The call is synchronous, so startup completes only after it returns.
    check_and_index_csv_files()
144
+
145
# Script entry point: serve `app` with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )