swaroop-uddandarao committed
Commit fed116a · 1 Parent(s): 408ab70

modified reports

report/Scores for RAGBenchCapstone.xlsx CHANGED
Binary files a/report/Scores for RAGBenchCapstone.xlsx and b/report/Scores for RAGBenchCapstone.xlsx differ
 
report/analyze_scores.py CHANGED
@@ -38,7 +38,7 @@ def create_performance_comparison(milvus_llama, weaviate_mistral, milvus_mistral
        'Milvus + Mistral': milvus_mistral['Retrieval_Time'].dropna()
    }
    sns.boxplot(data=pd.DataFrame(data), ax=axes[0,0])
-     axes[0,0].set_title('Retrieval Time Comparison')
+     axes[0,0].set_title('VectorDB Retrieval Time Comparison')
    axes[0,0].set_ylabel('Time (seconds)')
    axes[0,0].tick_params(axis='x', rotation=45)

@@ -76,26 +76,121 @@ def create_performance_comparison(milvus_llama, weaviate_mistral, milvus_mistral
    plt.savefig('report/visualizations/performance_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

- def create_correlation_heatmaps(milvus_llama, weaviate_mistral, milvus_mistral):
-     plt.figure(figsize=(20, 6))
-
-     # Create correlation heatmaps for each configuration
-     plt.subplot(1, 3, 1)
-     sns.heatmap(milvus_llama.corr(), annot=True, cmap='coolwarm', fmt='.2f', square=True)
-     plt.title('Milvus + LLaMA Correlations')
-
-     plt.subplot(1, 3, 2)
-     sns.heatmap(weaviate_mistral.corr(), annot=True, cmap='coolwarm', fmt='.2f', square=True)
-     plt.title('Weaviate + Mistral Correlations')
-
-     plt.subplot(1, 3, 3)
-     sns.heatmap(milvus_mistral.corr(), annot=True, cmap='coolwarm', fmt='.2f', square=True)
-     plt.title('Milvus + Mistral Correlations')
+ def create_correlation_plots(milvus_llama, weaviate_mistral, milvus_mistral):
+     # Create separate plots for each model
+
+     # 1. Milvus + LLaMA
+     plt.figure(figsize=(15, 10))
+
+     # Relevance comparison
+     plt.subplot(2, 1, 1)
+     plt.plot(range(len(milvus_llama)), milvus_llama['RMSE_Context_Rel'], 'o--',
+              color='red', label='RMSE Context Relevance', linewidth=2, alpha=0.7)
+     plt.plot(range(len(milvus_llama)), milvus_llama['Context_Relevance'], 'o-',
+              color='darkred', label='Context Relevance', linewidth=2, alpha=0.7)
+     plt.title('Milvus + LLaMA: Context Relevance vs RMSE')
+     plt.xlabel('Data Points')
+     plt.ylabel('Score')
+     plt.grid(True, linestyle='--', alpha=0.7)
+     plt.legend()
+
+     # Utilization comparison
+     plt.subplot(2, 1, 2)
+     plt.plot(range(len(milvus_llama)), milvus_llama['RMSE_Context_Util'], 'o--',
+              color='blue', label='RMSE Context Utilization', linewidth=2, alpha=0.7)
+     plt.plot(range(len(milvus_llama)), milvus_llama['Context_Utilization'], 'o-',
+              color='darkblue', label='Context Utilization', linewidth=2, alpha=0.7)
+     plt.title('Milvus + LLaMA: Context Utilization vs RMSE')
+     plt.xlabel('Data Points')
+     plt.ylabel('Score')
+     plt.grid(True, linestyle='--', alpha=0.7)
+     plt.legend()
+
+     plt.tight_layout()
+     plt.savefig('report/visualizations/milvus_llama_plots.png', bbox_inches='tight', dpi=300)
+     plt.close()
+
+     # 2. Weaviate + Mistral
+     plt.figure(figsize=(15, 10))
+
+     # Relevance comparison
+     plt.subplot(2, 1, 1)
+     plt.plot(range(len(weaviate_mistral)), weaviate_mistral['RMSE_Context_Rel'], 'o--',
+              color='red', label='RMSE Context Relevance', linewidth=2, alpha=0.7)
+     plt.plot(range(len(weaviate_mistral)), weaviate_mistral['Context_Rel'], 'o-',
+              color='darkred', label='Context Relevance', linewidth=2, alpha=0.7)
+     plt.title('Weaviate + Mistral: Context Relevance vs RMSE')
+     plt.xlabel('Data Points')
+     plt.ylabel('Score')
+     plt.grid(True, linestyle='--', alpha=0.7)
+     plt.legend()
+
+     # Utilization comparison
+     plt.subplot(2, 1, 2)
+     plt.plot(range(len(weaviate_mistral)), weaviate_mistral['RMSE_Context_Util'], 'o--',
+              color='blue', label='RMSE Context Utilization', linewidth=2, alpha=0.7)
+     plt.plot(range(len(weaviate_mistral)), weaviate_mistral['Util'], 'o-',
+              color='darkblue', label='Context Utilization', linewidth=2, alpha=0.7)
+     plt.title('Weaviate + Mistral: Context Utilization vs RMSE')
+     plt.xlabel('Data Points')
+     plt.ylabel('Score')
+     plt.grid(True, linestyle='--', alpha=0.7)
+     plt.legend()
+
+     plt.tight_layout()
+     plt.savefig('report/visualizations/weaviate_mistral_plots.png', bbox_inches='tight', dpi=300)
+     plt.close()
+
+     # 3. Milvus + Mistral
+     plt.figure(figsize=(15, 10))
+
+     # Relevance comparison
+     plt.subplot(2, 1, 1)
+     plt.plot(range(len(milvus_mistral)), milvus_mistral['RMSE_Context_Rel'], 'o--',
+              color='red', label='RMSE Context Relevance', linewidth=2, alpha=0.7)
+     plt.plot(range(len(milvus_mistral)), milvus_mistral['Context_Rel'], 'o-',
+              color='darkred', label='Context Relevance', linewidth=2, alpha=0.7)
+     plt.title('Milvus + Mistral: Context Relevance vs RMSE')
+     plt.xlabel('Data Points')
+     plt.ylabel('Score')
+     plt.grid(True, linestyle='--', alpha=0.7)
+     plt.legend()
+
+     # Utilization comparison
+     plt.subplot(2, 1, 2)
+     plt.plot(range(len(milvus_mistral)), milvus_mistral['RMSE_Context_Util'], 'o--',
+              color='blue', label='RMSE Context Utilization', linewidth=2, alpha=0.7)
+     plt.plot(range(len(milvus_mistral)), milvus_mistral['Util'], 'o-',
+              color='darkblue', label='Context Utilization', linewidth=2, alpha=0.7)
+     plt.title('Milvus + Mistral: Context Utilization vs RMSE')
+     plt.xlabel('Data Points')
+     plt.ylabel('Score')
+     plt.grid(True, linestyle='--', alpha=0.7)
+     plt.legend()

    plt.tight_layout()
-     plt.savefig('report/visualizations/correlation_heatmaps.png', dpi=300, bbox_inches='tight')
+     plt.savefig('report/visualizations/milvus_mistral_plots.png', bbox_inches='tight', dpi=300)
    plt.close()

+     # Print statistical analysis for each model
+     print("\nStatistical Analysis:")
+
+     models = {
+         'Milvus + LLaMA': (milvus_llama['RMSE_Context_Rel'], milvus_llama['Context_Relevance'],
+                            milvus_llama['RMSE_Context_Util'], milvus_llama['Context_Utilization']),
+         'Weaviate + Mistral': (weaviate_mistral['RMSE_Context_Rel'], weaviate_mistral['Context_Rel'],
+                                weaviate_mistral['RMSE_Context_Util'], weaviate_mistral['Util']),
+         'Milvus + Mistral': (milvus_mistral['RMSE_Context_Rel'], milvus_mistral['Context_Rel'],
+                              milvus_mistral['RMSE_Context_Util'], milvus_mistral['Util'])
+     }
+
+     for model, (rmse_rel, rel, rmse_util, util) in models.items():
+         print(f"\n{model}:")
+         print(f"Context Relevance - Mean: {rel.mean():.3f}, Std: {rel.std():.3f}")
+         print(f"RMSE Context Rel - Mean: {rmse_rel.mean():.3f}, Std: {rmse_rel.std():.3f}")
+         print(f"Context Utilization - Mean: {util.mean():.3f}, Std: {util.std():.3f}")
+         print(f"RMSE Context Util - Mean: {rmse_util.mean():.3f}, Std: {rmse_util.std():.3f}")
+
def create_violin_plots(milvus_llama, weaviate_mistral, milvus_mistral):
    metrics = ['RMSE_Context_Rel', 'RMSE_Context_Util', 'AUCROC']

@@ -137,7 +232,7 @@ def main():

    # Create visualizations
    create_performance_comparison(milvus_llama, weaviate_mistral, milvus_mistral)
-     create_correlation_heatmaps(milvus_llama, weaviate_mistral, milvus_mistral)
+     create_correlation_plots(milvus_llama, weaviate_mistral, milvus_mistral)
    create_violin_plots(milvus_llama, weaviate_mistral, milvus_mistral)

    # Print statistics
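
The new `create_correlation_plots` entry point can also be exercised on its own. Here is a minimal driver sketch, assuming the three configurations live on separate sheets of `report/Scores for RAGBenchCapstone.xlsx`; the sheet names and import path are placeholders, and the script's own `main()` does the real loading:

```python
# Hypothetical standalone driver; sheet names and the import path are assumptions.
import pandas as pd

from analyze_scores import create_correlation_plots  # assumed import path

xlsx = "report/Scores for RAGBenchCapstone.xlsx"
milvus_llama = pd.read_excel(xlsx, sheet_name="Milvus_LLaMA")          # assumed sheet name
weaviate_mistral = pd.read_excel(xlsx, sheet_name="Weaviate_Mistral")  # assumed sheet name
milvus_mistral = pd.read_excel(xlsx, sheet_name="Milvus_Mistral")      # assumed sheet name

# Writes one line-plot figure per configuration under report/visualizations/
# and prints mean/std statistics for each metric.
create_correlation_plots(milvus_llama, weaviate_mistral, milvus_mistral)
```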
report/architecture.md ADDED
@@ -0,0 +1,154 @@
+ # RAG Benchmark Evaluation System Architecture
+
+ ## High-Level Architecture Overview
+
+ The system follows a modular architecture with the following key components:
+
+ ### 1. Data Layer
+
+ - **Dataset Loading** (loaddataset.py)
+
+   - Handles RAGBench dataset loading from HuggingFace
+   - Processes multiple dataset configurations
+   - Extracts and normalizes data
+
+ - **Vector Database** (Milvus)
+   - Stores document embeddings
+   - Enables efficient similarity search
+   - Manages metadata and scores
+
+ ### 2. Processing Layer
+
+ - **Document Processing**
+
+   - Chunking (insertmilvushelper.py)
+   - Sliding window implementation
+   - Overlap management
+
+ - **Embedding Generation**
+   - SentenceTransformer models
+   - Vector representation creation
+   - Dimension reduction
+
+ ### 3. Search & Retrieval Layer
+
+ - **Vector Search** (searchmilvushelper.py)
+
+   - Cosine similarity computation
+   - Top-K retrieval
+   - Result ranking
+
+ - **Reranking System** (finetuneresults.py)
+   - Multiple reranker options (MS MARCO, MonoT5)
+   - Context relevance scoring
+   - Result refinement
+
+ ### 4. Generation Layer
+
+ - **LLM Integration** (generationhelper.py)
+   - Multiple model support (LLaMA, Mistral)
+   - Context-aware response generation
+   - Prompt engineering
+
+ ### 5. Evaluation Layer
+
+ - **Metrics Calculation** (calculatescores.py)
+   - RMSE computation
+   - AUCROC calculation
+   - Context relevance/utilization scoring
+
+ ### 6. Presentation Layer
+
+ - **Web Interface** (app.py)
+   - Gradio-based UI
+   - Interactive model selection
+   - Real-time result display
+
+ ## Data Flow
+
+ 1. User submits query through Gradio interface
+ 2. Query is embedded and searched in Milvus
+ 3. Retrieved documents are reranked
+ 4. LLM generates response using context
+ 5. Response is evaluated and scored
+ 6. Results are displayed to user
+
+ ## Architecture Diagram
+
+ ```mermaid
+ graph TB
+     %% User Interface Layer
+     UI[Web Interface - Gradio]
+
+     %% Data Layer
+     subgraph Data Layer
+         DS[RAGBench Dataset]
+         VDB[(Milvus Vector DB)]
+     end
+
+     %% Processing Layer
+     subgraph Processing Layer
+         DP[Document Processing]
+         EG[Embedding Generation]
+         style DP fill:#f9f,stroke:#333
+         style EG fill:#f9f,stroke:#333
+     end
+
+     %% Search & Retrieval Layer
+     subgraph Search & Retrieval
+         VS[Vector Search]
+         RR[Reranking System]
+         style VS fill:#bbf,stroke:#333
+         style RR fill:#bbf,stroke:#333
+     end
+
+     %% Generation Layer
+     subgraph Generation Layer
+         LLM[LLM Models]
+         PR[Prompt Engineering]
+         style LLM fill:#bfb,stroke:#333
+         style PR fill:#bfb,stroke:#333
+     end
+
+     %% Evaluation Layer
+     subgraph Evaluation Layer
+         ME[Metrics Evaluation]
+         SC[Score Calculation]
+         style ME fill:#ffb,stroke:#333
+         style SC fill:#ffb,stroke:#333
+     end
+
+     %% Flow Connections
+     UI --> DP
+     DS --> DP
+     DP --> EG
+     EG --> VDB
+     UI --> VS
+     VS --> VDB
+     VS --> RR
+     RR --> LLM
+     LLM --> PR
+     PR --> ME
+     ME --> SC
+     SC --> UI
+
+     %% Model Components
+     subgraph Models
+         ST[SentenceTransformers]
+         RM[Reranking Models]
+         GM[Generation Models]
+         style ST fill:#dfd,stroke:#333
+         style RM fill:#dfd,stroke:#333
+         style GM fill:#dfd,stroke:#333
+     end
+
+     %% Model Connections
+     EG --> ST
+     RR --> RM
+     LLM --> GM
+
+     %% Styling
+     classDef default fill:#fff,stroke:#333,stroke-width:2px;
+     classDef interface fill:#f96,stroke:#333,stroke-width:2px;
+     class UI interface;
+ ```
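
The retrieval steps in the data flow above (embed the query, search Milvus, rerank) can be sketched compactly. The collection, field, and model names below are placeholders rather than the repository's actual configuration; the real logic lives in searchmilvushelper.py, finetuneresults.py, and generationhelper.py:

```python
# Simplified sketch of the query path; identifiers marked "assumed" are placeholders.
from pymilvus import Collection, connections
from sentence_transformers import CrossEncoder, SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")               # assumed embedding model
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # one MS MARCO reranker option

connections.connect(host="localhost", port="19530")              # assumed Milvus endpoint
collection = Collection("ragbench_documents")                    # assumed collection name

def retrieve_context(query: str, top_k: int = 5) -> str:
    # 1) Embed the query and run a cosine-similarity search in Milvus
    query_vec = embedder.encode(query).tolist()
    hits = collection.search(
        data=[query_vec],
        anns_field="embedding",                                   # assumed vector field name
        param={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=top_k,
        output_fields=["text"],                                   # assumed payload field name
    )[0]
    docs = [hit.entity.get("text") for hit in hits]

    # 2) Rerank the retrieved chunks by query-document relevance
    scores = reranker.predict([(query, doc) for doc in docs])
    ranked = [doc for _, doc in sorted(zip(scores, docs), key=lambda p: p[0], reverse=True)]

    # 3) The top chunks become the context passed to the LLM prompt
    #    (response generation and scoring are handled elsewhere in the pipeline)
    return "\n\n".join(ranked[:3])
```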
report/finalreport.md CHANGED
@@ -1,46 +1,51 @@
- Performance Analysis Report
- =========================
+ # Performance Analysis Report

 1. **Retrieval Time**:
+
   - Milvus + LLaMA: 0.132s
   - Weaviate + Mistral: 0.157s
   - Milvus + Mistral: NaN

 2. **Context Relevance** (higher is better):
+
   - Milvus + LLaMA: 0.640
   - Weaviate + Mistral: 0.591
   - Milvus + Mistral: 0.518

 3. **Context Utilization** (higher is better):
+
   - Milvus + LLaMA: 0.673
   - Weaviate + Mistral: 0.619
   - Milvus + Mistral: 0.614

 4. **AUCROC** (Area Under ROC Curve):
+
   - Milvus + LLaMA: 0.912
   - Weaviate + Mistral: 0.750
   - Milvus + Mistral: 0.844

 5. **RMSE** (Root Mean Square Error):
-   - Milvus + LLaMA:
-     * Context Relevance RMSE: 0.179
-     * Context Utilization RMSE: 0.302
+   - Milvus + LLaMA:
+     - Context Relevance RMSE: 0.179
+     - Context Utilization RMSE: 0.302
   - Weaviate + Mistral:
-     * Context Relevance RMSE: 0.414
-     * Context Utilization RMSE: 0.482
+     - Context Relevance RMSE: 0.414
+     - Context Utilization RMSE: 0.482
   - Milvus + Mistral:
-     * Context Relevance RMSE: 0.167
-     * Context Utilization RMSE: 0.258
+     - Context Relevance RMSE: 0.167
+     - Context Utilization RMSE: 0.258
+
- Analysis
- --------
+ ## Analysis

 1. **Best Overall Performance: Milvus + LLaMA**
+
   - Highest AUCROC score (0.912)
   - Best context relevance (0.640) and utilization (0.673)
   - Fast retrieval time (0.132s)
   - Moderate RMSE scores

 2. **Runner-up: Milvus + Mistral**
+
   - Second-best AUCROC (0.844)
   - Lowest RMSE scores overall
   - Lower context relevance and utilization

@@ -52,12 +57,13 @@ Analysis
   - Slowest retrieval time (0.157s)
   - Moderate context metrics

- Recommendation
- -------------
+ ## Recommendation
+
 Based on the comprehensive analysis of all metrics, Milvus + LLaMA emerges as the optimal choice for overall performance. It demonstrates:
+
 - Superior accuracy (highest AUCROC)
 - Better context handling capabilities
 - Efficient retrieval speed
 - Reasonable error rates

- However, if minimizing error (RMSE) is the primary objective, Milvus + Mistral could be a viable alternative due to its lower error rates in both context relevance and utilization metrics.
+ However, if minimizing error (RMSE) is the primary objective, Milvus + Mistral could be a viable alternative due to its lower error rates in both context relevance and utilization metrics.
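
For reference, metrics like the RMSE and AUCROC values reported above are typically computed from per-sample predicted and ground-truth scores (the project's actual numbers come from calculatescores.py); a generic sketch with toy values:

```python
# Generic metric sketch with toy values -- not the report's data.
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score

def rmse(y_true, y_pred):
    """Root Mean Square Error, e.g. predicted vs. ground-truth context relevance."""
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

true_relevance = np.array([0.9, 0.4, 0.7, 0.2])
pred_relevance = np.array([0.8, 0.5, 0.6, 0.3])
print(f"Context Relevance RMSE: {rmse(true_relevance, pred_relevance):.3f}")

# AUCROC scores how well a predicted score separates a binary label
# (e.g. relevant vs. not relevant documents).
true_labels = np.array([1, 0, 1, 0])
pred_scores = np.array([0.85, 0.30, 0.65, 0.40])
print(f"AUCROC: {roc_auc_score(true_labels, pred_scores):.3f}")
```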
report/visualizations/correlation_heatmaps.png DELETED
Binary file (561 kB)
 
report/visualizations/milvus_llama_plots.png ADDED
report/visualizations/milvus_mistral_plots.png ADDED
report/visualizations/performance_comparison.png CHANGED
report/visualizations/weaviate_mistral_plots.png ADDED