ppsingh committed on
Commit
72ee60b
·
verified ·
1 Parent(s): 7474f53

Update auditqa/process_chunks.py

Browse files
Files changed (1) hide show
  1. auditqa/process_chunks.py +13 -50
auditqa/process_chunks.py CHANGED
@@ -49,48 +49,15 @@ def load_chunks():
49
  # 'source'=='category', 'subtype', these are used in UI for document selection
50
  # which will be used later for filtering database
51
  config = getconfig("./model_params.cfg")
52
- all_documents = {}
53
- categories = list(files.keys())
54
- # iterate through 'source'
55
- for category in categories:
56
- print("documents splitting in source:",category)
57
- all_documents[category] = []
58
- subtypes = list(files[category].keys())
59
- # iterate through 'subtype' within the source
60
- # example source/category == 'District', has subtypes which is district names
61
- for subtype in subtypes:
62
- print("document splitting for subtype:",subtype)
63
- for file in files[category][subtype]:
64
-
65
- # load the chunks
66
- try:
67
- doc_processed = open_file(path_to_data + file + "/"+ file+ ".chunks.json" )
68
-
69
-
70
- except Exception as e:
71
- print("Exception: ", e)
72
- print("chunks in subtype:",subtype, "are:",len(doc_processed))
73
-
74
- # add metadata information
75
- chunks_list = []
76
- for doc in doc_processed:
77
- chunks_list.append(Document(page_content= doc['content'],
78
- metadata={"source": category,
79
- "subtype":subtype,
80
- "year":file[-4:],
81
- "filename":file,
82
- "page":doc['metadata']['page'],
83
- "headings":doc['metadata']['headings']}))
84
-
85
- all_documents[category].append(chunks_list)
86
-
87
- # convert list of list to flat list
88
- for key, docs_processed in all_documents.items():
89
- docs_processed = [item for sublist in docs_processed for item in sublist]
90
- print("length of chunks in source:",key, "are:",len(docs_processed))
91
- all_documents[key] = docs_processed
92
- all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
93
- all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
94
  # define embedding model
95
  embeddings = HuggingFaceEmbeddings(
96
  model_kwargs = {'device': device},
@@ -99,16 +66,12 @@ def load_chunks():
99
  )
100
  # placeholder for collection
101
  qdrant_collections = {}
102
-
103
-
104
- for file,value in all_documents.items():
105
- if file == "allreports":
106
- print("emebddings for:",file)
107
- qdrant_collections[file] = Qdrant.from_documents(
108
- value,
109
  embeddings,
110
  path="/data/local_qdrant",
111
- collection_name=file,
112
  )
113
  print(qdrant_collections)
114
  print("vector embeddings done")
 
49
  # 'source'=='category', 'subtype', these are used in UI for document selection
50
  # which will be used later for filtering database
51
  config = getconfig("./model_params.cfg")
52
+
53
+ doc_processed = open_file(path_to_data + "new_chunks.json" )
54
+ chunks_list = []
55
+
56
+ for doc in doc_processed:
57
+ chunks_list.append(Document(page_content= doc['content'],
58
+ metadata=doc['metadata']
59
+ ))
60
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  # define embedding model
62
  embeddings = HuggingFaceEmbeddings(
63
  model_kwargs = {'device': device},
 
66
  )
67
  # placeholder for collection
68
  qdrant_collections = {}
69
+ print("embeddings started")
70
+ qdrant_collections['reportsFeb2025'] = Qdrant.from_documents(
71
+ chunks_list,
 
 
 
 
72
  embeddings,
73
  path="/data/local_qdrant",
74
+ collection_name='reportsFeb2025',
75
  )
76
  print(qdrant_collections)
77
  print("vector embeddings done")