DeepMostInnovations committed (verified)
Commit e3247e3 · 1 Parent(s): e289938

Upload Hindi embeddings model and all associated files
embedding_model.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:343abe4c8b9a9e8f88fca1c5ab8bfc129c1ab2b3852c598e56ae7b3c3dedc1f1
-size 1491671832
+oid sha256:540a0bf4ad54049eb94815984cad54d3b569bab3a02317e1f0ee306d84f2e1d8
+size 497221224
embedding_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18421d6c53cf3da7ae06f462560fe31c549a3ef90dbc921ecc1c6b1c97f227ff
+oid sha256:7f4a7f4984772c76db0f26b34dfae9d7e8b55180a2bf4371733ed1594cf65f9e
 size 497156468
hindi-rag-system.py ADDED
@@ -0,0 +1,1052 @@
import os
import torch
import json
import argparse
import numpy as np
import re
from torch import nn
from torch.nn import functional as F
import sentencepiece as spm
import math
from safetensors.torch import save_file, load_file
from tqdm import tqdm
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS as LangchainFAISS
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from typing import List, Dict, Any, Optional, Callable
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gc
import warnings

# Ignore specific HuggingFace warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*The model doesn't have tied token embeddings.*")

# Tokenizer wrapper class - same as in original code
class SentencePieceTokenizerWrapper:
    def __init__(self, sp_model_path):
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sp_model_path)
        self.vocab_size = self.sp_model.GetPieceSize()

        # Special token IDs from tokenizer training
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.unk_token_id = 3

        # Set special tokens
        self.pad_token = "<pad>"
        self.bos_token = "<s>"
        self.eos_token = "</s>"
        self.unk_token = "<unk>"
        self.mask_token = "<mask>"

    def __call__(self, text, padding=False, truncation=False, max_length=None, return_tensors=None):
        # Handle both string and list inputs
        if isinstance(text, str):
            # Encode a single string
            ids = self.sp_model.EncodeAsIds(text)

            # Handle truncation
            if truncation and max_length and len(ids) > max_length:
                ids = ids[:max_length]

            attention_mask = [1] * len(ids)

            # Handle padding
            if padding and max_length:
                padding_length = max(0, max_length - len(ids))
                ids = ids + [self.pad_token_id] * padding_length
                attention_mask = attention_mask + [0] * padding_length

            result = {
                'input_ids': ids,
                'attention_mask': attention_mask
            }

            # Convert to tensors if requested
            if return_tensors == 'pt':
                import torch
                result = {k: torch.tensor([v]) for k, v in result.items()}

            return result

        # Process a batch of texts
        batch_encoded = [self.sp_model.EncodeAsIds(t) for t in text]

        # Apply truncation if needed
        if truncation and max_length:
            batch_encoded = [ids[:max_length] for ids in batch_encoded]

        # Create attention masks
        batch_attention_mask = [[1] * len(ids) for ids in batch_encoded]

        # Apply padding if needed
        if padding:
            if max_length:
                max_len = max_length
            else:
                max_len = max(len(ids) for ids in batch_encoded)

            # Pad sequences to max_len
            batch_encoded = [ids + [self.pad_token_id] * (max_len - len(ids)) for ids in batch_encoded]
            batch_attention_mask = [mask + [0] * (max_len - len(mask)) for mask in batch_attention_mask]

        result = {
            'input_ids': batch_encoded,
            'attention_mask': batch_attention_mask
        }

        # Convert to tensors if requested
        if return_tensors == 'pt':
            import torch
            result = {k: torch.tensor(v) for k, v in result.items()}

        return result

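# Example (illustrative only, not part of the original script; "tokenizer.model"
# is a placeholder for the SentencePiece model shipped with this repository):
#
#   tok = SentencePieceTokenizerWrapper("tokenizer.model")
#   batch = tok(["नमस्ते दुनिया", "यह एक उदाहरण वाक्य है"],
#               padding=True, truncation=True, max_length=16, return_tensors="pt")
#   # batch["input_ids"].shape    -> torch.Size([2, 16])
#   # batch["attention_mask"]     -> 1 for real tokens, 0 for padding
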
# Model architecture definitions for inference

class MultiHeadAttention(nn.Module):
    """Advanced multi-headed attention with relative positional encoding"""
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config["num_attention_heads"]
        self.attention_head_size = config["hidden_size"] // config["num_attention_heads"]
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query, Key, Value projections
        self.query = nn.Linear(config["hidden_size"], self.all_head_size)
        self.key = nn.Linear(config["hidden_size"], self.all_head_size)
        self.value = nn.Linear(config["hidden_size"], self.all_head_size)

        # Output projection
        self.output = nn.Sequential(
            nn.Linear(self.all_head_size, config["hidden_size"]),
            nn.Dropout(config["attention_probs_dropout_prob"])
        )

        # Simplified relative position bias approach
        self.max_position_embeddings = config["max_position_embeddings"]
        self.relative_attention_bias = nn.Embedding(
            2 * config["max_position_embeddings"] - 1,
            config["num_attention_heads"]
        )

    def transpose_for_scores(self, x):
        new_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        batch_size, seq_length = hidden_states.size()[:2]

        # Project inputs to queries, keys, and values
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Take the dot product between query and key to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        # Generate relative position matrix
        position_ids = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device)
        relative_position = position_ids.unsqueeze(1) - position_ids.unsqueeze(0)  # [seq_len, seq_len]
        # Shift values to be >= 0
        relative_position = relative_position + self.max_position_embeddings - 1
        # Ensure indices are within bounds
        relative_position = torch.clamp(relative_position, 0, 2 * self.max_position_embeddings - 2)

        # Get relative position embeddings [seq_len, seq_len, num_heads]
        rel_attn_bias = self.relative_attention_bias(relative_position)  # [seq_len, seq_len, num_heads]

        # Reshape to add to attention heads [1, num_heads, seq_len, seq_len]
        rel_attn_bias = rel_attn_bias.permute(2, 0, 1).unsqueeze(0)

        # Add to attention scores - now dimensions will match
        attention_scores = attention_scores + rel_attn_bias

        # Scale attention scores
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Apply attention mask
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities
        attention_probs = F.softmax(attention_scores, dim=-1)

        # Apply dropout
        attention_probs = F.dropout(attention_probs, p=0.1, training=self.training)

        # Apply attention to values
        context_layer = torch.matmul(attention_probs, value_layer)

        # Reshape back to [batch_size, seq_length, hidden_size]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_shape)

        # Final output projection
        output = self.output(context_layer)

        return output

class EnhancedTransformerLayer(nn.Module):
    """Advanced transformer layer with pre-layer norm and enhanced attention"""
    def __init__(self, config):
        super().__init__()
        self.attention_pre_norm = nn.LayerNorm(config["hidden_size"], eps=config["layer_norm_eps"])
        self.attention = MultiHeadAttention(config)

        self.ffn_pre_norm = nn.LayerNorm(config["hidden_size"], eps=config["layer_norm_eps"])

        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(config["hidden_size"], config["intermediate_size"]),
            nn.GELU(),
            nn.Dropout(config["hidden_dropout_prob"]),
            nn.Linear(config["intermediate_size"], config["hidden_size"]),
            nn.Dropout(config["hidden_dropout_prob"])
        )

    def forward(self, hidden_states, attention_mask=None):
        # Pre-layer norm for attention
        attn_norm_hidden = self.attention_pre_norm(hidden_states)

        # Self-attention
        attention_output = self.attention(attn_norm_hidden, attention_mask)

        # Residual connection
        hidden_states = hidden_states + attention_output

        # Pre-layer norm for feed-forward
        ffn_norm_hidden = self.ffn_pre_norm(hidden_states)

        # Feed-forward
        ffn_output = self.ffn(ffn_norm_hidden)

        # Residual connection
        hidden_states = hidden_states + ffn_output

        return hidden_states

class AdvancedTransformerModel(nn.Module):
    """Advanced Transformer model for inference"""

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Embeddings
        self.word_embeddings = nn.Embedding(
            config["vocab_size"],
            config["hidden_size"],
            padding_idx=config["pad_token_id"]
        )

        # Position embeddings
        self.position_embeddings = nn.Embedding(config["max_position_embeddings"], config["hidden_size"])

        # Embedding dropout
        self.embedding_dropout = nn.Dropout(config["hidden_dropout_prob"])

        # Transformer layers
        self.layers = nn.ModuleList([
            EnhancedTransformerLayer(config) for _ in range(config["num_hidden_layers"])
        ])

        # Final layer norm
        self.final_layer_norm = nn.LayerNorm(config["hidden_size"], eps=config["layer_norm_eps"])

    def forward(self, input_ids, attention_mask=None):
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape

        # Get position ids
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)

        # Get embeddings
        word_embeds = self.word_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)

        # Sum embeddings
        embeddings = word_embeds + position_embeds

        # Apply dropout
        embeddings = self.embedding_dropout(embeddings)

        # Default attention mask
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=input_ids.device)

        # Extended attention mask for transformer layers (1 for tokens to attend to, 0 for masked tokens)
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Apply transformer layers
        hidden_states = embeddings
        for layer in self.layers:
            hidden_states = layer(hidden_states, extended_attention_mask)

        # Final layer norm
        hidden_states = self.final_layer_norm(hidden_states)

        return hidden_states

class AdvancedPooling(nn.Module):
    """Advanced pooling module supporting multiple pooling strategies"""
    def __init__(self, config):
        super().__init__()
        self.pooling_mode = config["pooling_mode"]  # 'mean', 'max', 'cls', 'attention'
        self.hidden_size = config["hidden_size"]

        # For attention pooling
        if self.pooling_mode == 'attention':
            self.attention_weights = nn.Linear(config["hidden_size"], 1)

        # For weighted pooling
        elif self.pooling_mode == 'weighted':
            self.weight_layer = nn.Linear(config["hidden_size"], 1)

    def forward(self, token_embeddings, attention_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(token_embeddings[:, :, 0])

        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        if self.pooling_mode == 'cls':
            # Use [CLS] token (first token)
            pooled = token_embeddings[:, 0]

        elif self.pooling_mode == 'max':
            # Max pooling
            token_embeddings = token_embeddings.clone()
            # Set padding tokens to large negative value to exclude them from max
            token_embeddings[mask_expanded == 0] = -1e9
            pooled = torch.max(token_embeddings, dim=1)[0]

        elif self.pooling_mode == 'attention':
            # Attention pooling
            weights = self.attention_weights(token_embeddings).squeeze(-1)
            # Mask out padding tokens
            weights = weights.masked_fill(attention_mask == 0, -1e9)
            weights = F.softmax(weights, dim=1).unsqueeze(-1)
            pooled = torch.sum(token_embeddings * weights, dim=1)

        elif self.pooling_mode == 'weighted':
            # Weighted average pooling
            weights = torch.sigmoid(self.weight_layer(token_embeddings)).squeeze(-1)
            # Apply mask
            weights = weights * attention_mask
            # Normalize weights
            sum_weights = torch.sum(weights, dim=1, keepdim=True)
            sum_weights = torch.clamp(sum_weights, min=1e-9)
            weights = weights / sum_weights
            # Apply weights
            pooled = torch.sum(token_embeddings * weights.unsqueeze(-1), dim=1)

        else:  # Default to mean pooling
            # Mean pooling
            sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            pooled = sum_embeddings / sum_mask

        # L2 normalize
        pooled = F.normalize(pooled, p=2, dim=1)

        return pooled

class SentenceEmbeddingModel(nn.Module):
    """Complete sentence embedding model for inference"""
    def __init__(self, config):
        super(SentenceEmbeddingModel, self).__init__()
        self.config = config

        # Create transformer model
        self.transformer = AdvancedTransformerModel(config)

        # Create pooling module
        self.pooling = AdvancedPooling(config)

        # Build projection module if needed
        if "projection_dim" in config and config["projection_dim"] > 0:
            self.use_projection = True
            self.projection = nn.Sequential(
                nn.Linear(config["hidden_size"], config["hidden_size"]),
                nn.GELU(),
                nn.Linear(config["hidden_size"], config["projection_dim"]),
                nn.LayerNorm(config["projection_dim"], eps=config["layer_norm_eps"])
            )
        else:
            self.use_projection = False

    def forward(self, input_ids, attention_mask=None):
        # Get token embeddings from transformer
        token_embeddings = self.transformer(input_ids, attention_mask)

        # Pool token embeddings
        pooled_output = self.pooling(token_embeddings, attention_mask)

        # Apply projection if enabled
        if self.use_projection:
            pooled_output = self.projection(pooled_output)
            pooled_output = F.normalize(pooled_output, p=2, dim=1)

        return pooled_output

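# Example (illustrative sketch, not part of the original script; the "./" paths are
# placeholders for wherever config.json, the weights and tokenizer.model live):
#
#   model, tok, cfg = load_model_and_tokenizer("./", tokenizer_dir="./")
#   enc = tok(["भारत की राजधानी नई दिल्ली है।"], padding=True,
#             truncation=True, max_length=128, return_tensors="pt")
#   with torch.no_grad():
#       emb = model(enc["input_ids"], enc["attention_mask"])  # L2-normalized, shape [1, dim]
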
def convert_to_safetensors(model_path, output_path):
    """Convert PyTorch model to safetensors format"""
    print(f"Converting model from {model_path} to safetensors format...")

    try:
        # First try with weights_only=False to handle PyTorch 2.6+ checkpoints
        checkpoint = torch.load(model_path, map_location="cpu", weights_only=False)
        print("Successfully loaded checkpoint with weights_only=False")
    except TypeError:
        # For older PyTorch versions that don't have weights_only parameter
        print("Falling back to default torch.load behavior for older PyTorch versions")
        checkpoint = torch.load(model_path, map_location="cpu")

    # Get model state dict
    if "model_state_dict" in checkpoint:
        state_dict = checkpoint["model_state_dict"]
        print("Extracted model_state_dict from checkpoint")
    else:
        state_dict = checkpoint
        print("Using entire checkpoint as state_dict")

    # Save as safetensors
    save_file(state_dict, output_path)
    print(f"Model converted and saved to {output_path}")

def load_model_and_tokenizer(model_dir, tokenizer_dir="/home/ubuntu/hindi_tokenizer"):
    """Load the model and tokenizer for inference"""

    # Load the config
    config_path = os.path.join(model_dir, "config.json")
    with open(config_path, "r") as f:
        config = json.load(f)

    # Load the tokenizer - use specified tokenizer directory
    tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.model")
    if not os.path.exists(tokenizer_path):
        # Try other locations
        tokenizer_path = os.path.join(model_dir, "tokenizer.model")
        if not os.path.exists(tokenizer_path):
            raise FileNotFoundError(f"Could not find tokenizer model at {tokenizer_path}")

    tokenizer = SentencePieceTokenizerWrapper(tokenizer_path)
    print(f"Loaded tokenizer from {tokenizer_path} with vocabulary size: {tokenizer.vocab_size}")

    # Load the model
    safetensors_path = os.path.join(model_dir, "embedding_model.safetensors")

    if not os.path.exists(safetensors_path):
        print(f"Safetensors model not found at {safetensors_path}, converting from PyTorch checkpoint...")

        # Convert from PyTorch checkpoint
        pytorch_path = os.path.join(model_dir, "embedding_model.pt")
        if not os.path.exists(pytorch_path):
            raise FileNotFoundError(f"Could not find PyTorch model at {pytorch_path}")

        convert_to_safetensors(pytorch_path, safetensors_path)

    # Load state dict from safetensors
    state_dict = load_file(safetensors_path)

    # Create model
    model = SentenceEmbeddingModel(config)

    # Load state dict
    try:
        # Try direct loading
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        print(f"Loaded model with missing keys: {missing_keys[:10]}{'...' if len(missing_keys) > 10 else ''}")
        print(f"Unexpected keys: {unexpected_keys[:10]}{'...' if len(unexpected_keys) > 10 else ''}")
    except Exception as e:
        print(f"Error loading state dict: {e}")
        print("Model will be initialized with random weights")

    model.eval()

    return model, tokenizer, config

# LangChain Custom Embeddings Class
class HindiSentenceEmbeddings(Embeddings):
    """
    Custom Langchain Embeddings class for Hindi sentence embeddings model
    """
    def __init__(self, model, tokenizer, device="cuda", batch_size=32, max_length=128):
        """Initialize with model, tokenizer, and inference parameters"""
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.batch_size = batch_size
        self.max_length = max_length

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents/texts"""
        embeddings = []

        with torch.no_grad():
            for i in range(0, len(texts), self.batch_size):
                batch = texts[i:i+self.batch_size]

                # Tokenize
                inputs = self.tokenizer(
                    batch,
                    padding="max_length",
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors="pt"
                )

                # Move to device
                input_ids = inputs["input_ids"].to(self.device)
                attention_mask = inputs["attention_mask"].to(self.device)

                # Get embeddings
                batch_embeddings = self.model(input_ids, attention_mask)

                # Move to CPU and convert to numpy
                batch_embeddings = batch_embeddings.cpu().numpy()
                embeddings.append(batch_embeddings)

        return np.vstack(embeddings).tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query/text"""
        return self.embed_documents([text])[0]

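# Example (illustrative sketch, not part of the original script): the class above is a
# standard LangChain Embeddings implementation, so it can be dropped into any LangChain
# vector store, e.g.
#
#   embeddings = HindiSentenceEmbeddings(model, tokenizer, device="cuda")
#   store = LangchainFAISS.from_texts(["पहला वाक्य", "दूसरा वाक्य"], embeddings)
#   store.similarity_search("पहला", k=1)
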
def extract_relevant_sentences(text, query, window_size=2):
    """
    Extract the most relevant sentences from text based on query keywords

    Args:
        text: The full text content
        query: The user's query
        window_size: Number of sentences to include before and after matched sentence

    Returns:
        String containing the most relevant portion of the text
    """
    # Clean and normalize query and text for matching
    query = query.strip().lower()

    # Remove question marks and other punctuation from query for matching
    query = re.sub(r'[?।॥!,.:]', '', query)

    # Extract keywords from the query (remove common Hindi stop words)
    stop_words = ['और', 'का', 'के', 'को', 'में', 'से', 'है', 'हैं', 'था', 'थे', 'की', 'कि', 'पर', 'एक', 'यह', 'वह', 'जो', 'ने', 'हो', 'कर']
    query_terms = [word for word in query.split() if word not in stop_words]

    if not query_terms:
        return text  # If no meaningful terms left, return the full text

    # Split text into sentences (using Hindi sentence terminators)
    sentences = re.split(r'([।॥!?.])', text)

    # Rejoin sentences with their terminators
    complete_sentences = []
    for i in range(0, len(sentences)-1, 2):
        if i+1 < len(sentences):
            complete_sentences.append(sentences[i] + sentences[i+1])
        else:
            complete_sentences.append(sentences[i])

    # If the above didn't work properly, try simpler approach
    if len(complete_sentences) <= 1:
        complete_sentences = re.split(r'[।॥!?.]', text)
        complete_sentences = [s.strip() for s in complete_sentences if s.strip()]

    # Score each sentence based on how many query terms it contains
    sentence_scores = []
    for i, sentence in enumerate(complete_sentences):
        sentence_lower = sentence.lower()
        # Calculate score based on number of query terms found
        score = sum(1 for term in query_terms if term in sentence_lower)
        sentence_scores.append((i, score))

    # Find the best matching sentence
    if not sentence_scores:
        return text[:500] + "..."  # Fallback

    # Get the index of sentence with highest score
    best_match_idx, best_score = max(sentence_scores, key=lambda x: x[1])

    # If no good match found, return the whole text (up to a limit)
    if best_score == 0:
        # Try partial word matching as a fallback
        for i, sentence in enumerate(complete_sentences):
            sentence_lower = sentence.lower()
            partial_score = sum(1 for term in query_terms if any(term in word.lower() for word in sentence_lower.split()))
            if partial_score > 0:
                best_match_idx = i
                break
        else:
            # If still no match, just return the first part of the text
            if len(text) > 1000:
                return text[:1000] + "..."
            return text

    # Get window of sentences around the best match
    start_idx = max(0, best_match_idx - window_size)
    end_idx = min(len(complete_sentences), best_match_idx + window_size + 1)

    # Create excerpt
    relevant_text = ' '.join(complete_sentences[start_idx:end_idx])

    # If the excerpt is short, return more context
    if len(relevant_text) < 100 and len(text) > len(relevant_text):
        # Add more context
        if end_idx < len(complete_sentences):
            relevant_text += ' ' + ' '.join(complete_sentences[end_idx:end_idx+2])
        if start_idx > 0:
            relevant_text = ' '.join(complete_sentences[max(0, start_idx-2):start_idx]) + ' ' + relevant_text

    # If the excerpt is too short or the whole text is small anyway, return whole text
    if len(relevant_text) < 50 or len(text) < 1000:
        return text

    return relevant_text

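# Example (illustrative, not part of the original script):
#
#   passage = "दिल्ली भारत की राजधानी है। यह यमुना नदी के किनारे बसा है।"
#   extract_relevant_sentences(passage, "भारत की राजधानी क्या है")
#   # -> the sentence naming the capital plus up to `window_size` neighbouring sentences
#   #    (a passage this short is simply returned unchanged by the final length check)
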
# Text processing and indexing functions
def load_and_process_text_file(file_path, chunk_size=500, chunk_overlap=100):
    """
    Load a text file and split it into semantically meaningful chunks
    """
    print(f"Loading and processing text file: {file_path}")

    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # For small files, just keep the whole content as a single chunk
    if len(content) <= chunk_size * 2:
        print(f"File content is small, keeping as a single chunk")
        return [Document(
            page_content=content,
            metadata={
                "source": file_path,
                "chunk_id": 0
            }
        )]

    # Split by paragraphs first
    paragraphs = re.split(r'\n\s*\n', content)
    chunks = []

    current_chunk = ""
    current_size = 0

    for para in paragraphs:
        if not para.strip():
            continue

        # If adding this paragraph would exceed the chunk size, save current chunk and start new one
        if current_size + len(para) > chunk_size and current_size > 0:
            chunks.append(current_chunk)
            current_chunk = para
            current_size = len(para)
        else:
            # Add paragraph to current chunk with a newline if not empty
            if current_size > 0:
                current_chunk += "\n\n" + para
            else:
                current_chunk = para
            current_size = len(current_chunk)

    # Add the last chunk if not empty
    if current_chunk:
        chunks.append(current_chunk)

    print(f"Split text into {len(chunks)} chunks")

    # Convert to LangChain documents with metadata
    documents = [
        Document(
            page_content=chunk,
            metadata={
                "source": file_path,
                "chunk_id": i
            }
        ) for i, chunk in enumerate(chunks)
    ]

    return documents

def create_vector_store(documents, embeddings, store_path=None):
    """
    Create a FAISS vector store from documents using the given embeddings
    """
    print("Creating FAISS vector store...")

    # Create vector store
    vector_store = LangchainFAISS.from_documents(documents, embeddings)

    # Save if path is provided
    if store_path:
        print(f"Saving vector store to {store_path}")
        vector_store.save_local(store_path)

    return vector_store

def load_vector_store(store_path, embeddings):
    """
    Load a FAISS vector store from disk
    """
    print(f"Loading vector store from {store_path}")
    return LangchainFAISS.load_local(store_path, embeddings, allow_dangerous_deserialization=True)

def perform_similarity_search(vector_store, query, k=6):
    """
    Perform basic similarity search on the vector store
    """
    print(f"Searching for: {query}")
    return vector_store.similarity_search_with_score(query, k=k)

# Llama model loading function
def load_llama_model(model_name="unsloth/Llama-3.2-1B-Instruct", device="cuda"):
    """
    Load and prepare Llama model for text generation
    """
    print(f"Loading LLM: {model_name}")

    # Check if CUDA is available
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        device = "cpu"

    # Quantization config for 4-bit precision to save memory
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    ) if device == "cuda" else None

    # Standard HuggingFace loading
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if device == "cuda":
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quantization
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model = model.to(device)

    print("Successfully loaded model")

    return model, tokenizer

def setup_qa_system(model, tokenizer, vector_store):
    """
    Set up a direct QA system using the model and retriever
    """
    # Create retriever
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3}
    )

    # Create a function to generate answers
    def generate_answer(query):
        # Retrieve documents
        try:
            docs = retriever.invoke(query)
        except Exception:
            # Fallback to older method if invoke isn't available
            docs = retriever.get_relevant_documents(query)

        # Extract the content
        context = "\n\n".join([doc.page_content for doc in docs])

        # Create prompt
        prompt = f"""
आपको निम्नलिखित संदर्भ से जानकारी के आधार पर एक प्रश्न का उत्तर देना है।
यदि आप उत्तर नहीं जानते हैं, तो बस "मुझे नहीं पता" कहें।

संदर्भ:
{context}

प्रश्न: {query}

उत्तर:
"""

        # Generate text
        inputs = tokenizer(prompt, return_tensors="pt")

        # Move to the same device as the model
        for k, v in inputs.items():
            if hasattr(v, "to") and callable(v.to):
                inputs[k] = v.to(model.device)

        with torch.no_grad():
            try:
                outputs = model.generate(
                    inputs.input_ids,
                    max_new_tokens=512,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True
                )
            except Exception as e:
                return f"Error generating response: {str(e)}"

        # Decode the generated text
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract just the answer part (after the prompt)
        answer = full_response.split("उत्तर:")[-1].strip()

        return answer

    return generate_answer

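# Example (illustrative sketch, not part of the original script; `vector_store` is a
# placeholder for an index built with create_vector_store or loaded with load_vector_store):
#
#   llm, llm_tok = load_llama_model()
#   answer_fn = setup_qa_system(llm, llm_tok, vector_store)
#   print(answer_fn("भारत की राजधानी क्या है?"))
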
# Main RAG functions
def index_text_files(model, tokenizer, data_dir, output_dir, device="cuda", chunk_size=500):
    """
    Index text files from a directory and create a FAISS vector store
    """
    print(f"Indexing text files from {data_dir} with chunk size ({chunk_size}) for fine-grained retrieval")

    # Create embedding model
    embeddings = HindiSentenceEmbeddings(model, tokenizer, device=device)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get all text files
    text_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.txt')]
    print(f"Found {len(text_files)} text files")

    # Process all text files
    all_documents = []
    for file_path in text_files:
        documents = load_and_process_text_file(file_path, chunk_size=chunk_size)
        all_documents.extend(documents)

    print(f"Total documents: {len(all_documents)}")

    # If we don't have enough chunks, reduce chunk size and try again
    if len(all_documents) < 10 and chunk_size > 50:
        print(f"Not enough chunks created. Reducing chunk size and trying again...")
        return index_text_files(model, tokenizer, data_dir, output_dir, device, chunk_size=chunk_size//2)

    # Create and save vector store
    vector_store_path = os.path.join(output_dir, "faiss_index")
    vector_store = create_vector_store(all_documents, embeddings, vector_store_path)

    return vector_store, embeddings

def query_text_corpus(model, tokenizer, vector_store_path, query, k=6, device="cuda"):
    """
    Query the text corpus using the indexed vector store
    """
    # Create embedding model
    embeddings = HindiSentenceEmbeddings(model, tokenizer, device=device)

    # Load vector store
    vector_store = load_vector_store(vector_store_path, embeddings)

    # Perform similarity search
    results = perform_similarity_search(vector_store, query, k=k)

    # Post-process results to combine adjacent chunks if they're from the same source
    processed_results = []
    seen_chunks = set()

    for doc, score in results:
        chunk_id = doc.metadata["chunk_id"]
        source = doc.metadata["source"]

        # Skip if we've already included this chunk
        if (source, chunk_id) in seen_chunks:
            continue

        seen_chunks.add((source, chunk_id))

        # Try to find adjacent chunks and combine them
        combined_content = doc.page_content

        # Look for adjacent chunks in results (both previous and next)
        for adj_id in [chunk_id-1, chunk_id+1]:
            for other_doc, _ in results:
                if (other_doc.metadata["source"] == source and
                        other_doc.metadata["chunk_id"] == adj_id and
                        (source, adj_id) not in seen_chunks):

                    # Add the adjacent chunk content
                    if adj_id < chunk_id:  # Previous chunk
                        combined_content = other_doc.page_content + " " + combined_content
                    else:  # Next chunk
                        combined_content = combined_content + " " + other_doc.page_content

                    seen_chunks.add((source, adj_id))

        # Create a new document with combined content
        combined_doc = Document(
            page_content=combined_content,
            metadata={
                "source": source,
                "chunk_id": chunk_id,
                "is_combined": True if combined_content != doc.page_content else False
            }
        )

        processed_results.append((combined_doc, score))

    return processed_results, vector_store

def main():
    parser = argparse.ArgumentParser(description="Hindi RAG System with LangChain and FAISS")
    parser.add_argument("--model_dir", type=str, default="/home/ubuntu/output/hindi-embeddings-custom-tokenizer/final",
                        help="Directory containing the model and tokenizer")
    parser.add_argument("--tokenizer_dir", type=str, default="/home/ubuntu/hindi_tokenizer",
                        help="Directory containing the tokenizer")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device to run inference on ('cuda' or 'cpu')")
    parser.add_argument("--index", action="store_true",
                        help="Index text files from data directory")
    parser.add_argument("--query", type=str, default=None,
                        help="Query to search in the indexed corpus")
    parser.add_argument("--data_dir", type=str, default="./data",
                        help="Directory containing text files for indexing")
    parser.add_argument("--output_dir", type=str, default="./output",
                        help="Directory to save the indexed vector store")
    parser.add_argument("--top_k", type=int, default=6,
                        help="Number of top results to return")
    parser.add_argument("--chunk_size", type=int, default=500,
                        help="Size of text chunks for indexing")
    parser.add_argument("--interactive", action="store_true",
                        help="Run in interactive mode for querying")
    parser.add_argument("--reindex", action="store_true",
                        help="Force reindexing even if index exists")
    parser.add_argument("--qa", action="store_true",
                        help="Use LLM for question answering instead of just retrieval")
    parser.add_argument("--llm_name", type=str, default="unsloth/Llama-3.2-1B-Instruct",
                        help="HuggingFace model name for the LLM")
    args = parser.parse_args()

    # Load embedding model and tokenizer
    embed_model, embed_tokenizer, config = load_model_and_tokenizer(args.model_dir, args.tokenizer_dir)

    # Move embedding model to device
    embed_model = embed_model.to(args.device)

    # Create vector store path
    vector_store_path = os.path.join(args.output_dir, "faiss_index")

    # Load LLM if QA is enabled
    llm_model = None
    llm_tokenizer = None
    qa_generator = None

    if args.qa:
        try:
            # Load LLM
            llm_model, llm_tokenizer = load_llama_model(args.llm_name, args.device)
            print("LLM loaded successfully for QA")
        except Exception as e:
            print(f"Error loading LLM: {e}")
            print("Falling back to retrieval-only mode")
            args.qa = False

    if args.index or args.reindex:
        # Index text files
        vector_store, _ = index_text_files(
            embed_model, embed_tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size
        )
        print(f"Indexing complete. Vector store saved to {vector_store_path}")

        # Set up QA chain if enabled
        if args.qa and llm_model is not None and llm_tokenizer is not None:
            qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)

    if args.query:
        # Query the corpus
        results, vector_store = query_text_corpus(
            embed_model, embed_tokenizer, vector_store_path, args.query, args.top_k, args.device
        )

        # Print retrieval results
        print("\nSearch Results:")
        for i, (doc, score) in enumerate(results):
            print(f"\nResult {i+1} (Score: {score:.4f}):")
            print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")

            # Extract and print only relevant sentences
            relevant_text = extract_relevant_sentences(doc.page_content, args.query)
            print(f"Content: {relevant_text}")

        # If QA is enabled, also answer the question using the LLM
        if args.qa and llm_model is not None and llm_tokenizer is not None:
            if qa_generator is None:
                qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)

            # Get answer from QA chain
            print("\nGenerating answer using LLM...")
            try:
                answer = qa_generator(args.query)
                print("\nLLM Answer:")
                print(answer)
            except Exception as e:
                print(f"Error generating answer: {e}")

    if args.interactive:
        print("\nInteractive mode. Enter queries (or type 'quit' to exit).")

        # For the first query, load vector store
        vector_store = None

        while True:
            print("\nEnter query:")
            query = input()

            if not query.strip():
                continue

            if query.lower() == 'quit':
                break

            # Query the corpus
            results, vector_store = query_text_corpus(
                embed_model, embed_tokenizer, vector_store_path, query, args.top_k, args.device
            )

            # Print retrieval results
            print("\nSearch Results:")
            for i, (doc, score) in enumerate(results):
                print(f"\nResult {i+1} (Score: {score:.4f}):")
                print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")

                # Extract and print only relevant sentences
                relevant_text = extract_relevant_sentences(doc.page_content, query)
                print(f"Content: {relevant_text}")

            # If QA is enabled, also answer the question using the LLM
            if args.qa and llm_model is not None and llm_tokenizer is not None:
                if qa_generator is None:
                    qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)

                print("\nGenerating answer using LLM...")
                try:
                    answer = qa_generator(query)
                    print("\nLLM Answer:")
                    print(answer)
                except Exception as e:
                    print(f"Error generating answer: {e}")

    # Clean up GPU memory
    if args.device == "cuda":
        gc.collect()
        torch.cuda.empty_cache()

if __name__ == "__main__":
    main()
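
# Example invocations (illustrative, not part of the original upload; paths are placeholders):
#
#   # Build the FAISS index from the .txt files in ./data
#   python hindi-rag-system.py --index --data_dir ./data --output_dir ./output
#
#   # Retrieval-only query against the saved index
#   python hindi-rag-system.py --query "भारत की राजधानी क्या है?" --output_dir ./output
#
#   # Retrieval plus LLM answer generation in interactive mode
#   python hindi-rag-system.py --qa --interactive --output_dir ./output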