aayushraina committed: Upload 13 files

- .gitattributes +1 -2
- README.md +114 -9
- app.py +136 -0
- app_gradio.py +123 -0
- byte_pair_encoder.py +222 -0
- data_loader.py +114 -0
- download_dataset.py +184 -0
- hindi_bpe.py +129 -0
- requirements.txt +11 -0
- test_hindi_bpe.py +148 -0
- tokenizer/encoder.json +3 -0
- tokenizer/vocab_stats.json +3 -0
- tokenizer_config.json +3 -0
.gitattributes
CHANGED
@@ -23,9 +23,7 @@
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,118 @@
 ---
-
-
-
-
-
-
-
-
+language: hi
+tags:
+- hindi
+- tokenizer
+- bpe
+- subword
+- text-processing
+pipeline_tag: text2text-generation
+inference: true
 license: mit
+spaces:
+- aayushraina/bpe-hindi
 ---
 
-
+# Hindi Byte Pair Encoding (BPE) Tokenizer
+
+A specialized BPE tokenizer for Hindi text that achieves efficient compression while maintaining linguistic coherence.
+
+## Online Demo
+
+Try the tokenizer in your browser: [Hindi BPE Tokenizer Demo](https://huggingface.co/spaces/aayushraina/bpe-hindi)
+
+## Project Overview
+
+This project implements a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text. It features:
+- Efficient trie-based tokenization
+- Visualization of training progress
+- Compression ratio optimization
+- Support for large Hindi text datasets
+- Hugging Face compatibility
+
+## Project Structure
+hindi-bpe/
+├── data/                  # Dataset directory
+│   ├── train/             # Training data
+│   └── valid/             # Validation data
+├── tokenizer/             # Saved tokenizer files
+│   ├── encoder.json       # Encoder state
+│   └── vocab_stats.json   # Vocabulary statistics
+├── output/                # Visualization outputs
+├── byte_pair_encoder.py   # Core BPE implementation
+├── hindi_bpe.py           # Hindi-specific wrapper
+├── test_hindi_bpe.py      # Test suite
+└── requirements.txt       # Dependencies
+
+## Training stats
+- Iteration 4500:
+  - Vocabulary size: 4,477
+  - Data size: 448,754
+  - Compression ratio: 3.66
+  - Max token length: 64
+
+## File Descriptions
+
+1. **byte_pair_encoder.py**
+   - Core BPE implementation
+   - Trie-based tokenization
+   - Training statistics tracking
+   - Visualization utilities
+
+2. **hindi_bpe.py**
+   - Hindi-specific tokenizer wrapper
+   - Text preprocessing
+   - Model saving/loading
+   - Compression ratio calculation
+
+3. **app.py**
+   - Interactive web interface
+   - Real-time tokenization
+   - Training visualization
+   - Model parameter tuning
+
+4. **test_hindi_bpe.py**
+   - Test suite for the tokenizer
+   - Performance benchmarks
+   - Example usage
+
+## Installation
+- Clone the repository:
+  - git clone https://github.com/yourusername/hindi-bpe.git
+  - cd hindi-bpe
+- Install dependencies:
+  - pip install -r requirements.txt
+
+## Download and prepare dataset
+- python download_dataset.py
+
+### Web Interface
+- streamlit run app.py
+
+### Tests
+- python test_hindi_bpe.py
+- The test suite includes:
+  - Training pipeline verification
+  - Compression ratio validation
+  - Token count requirements
+  - Encoding/decoding accuracy
+
+## Performance Metrics
+
+The tokenizer aims to achieve:
+- Vocabulary size < 5000 tokens
+- Compression ratio ≥ 3.2
+- Fast encoding/decoding
+- Memory-efficient operation
+
+## Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Commit your changes
+4. Push to the branch
+5. Create a Pull Request
+
+## License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
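For quick reference, a minimal usage sketch of the tokenizer API added in this commit (hindi_bpe.py and data_loader.py below). The vocabulary size and file count here are illustrative assumptions, not the settings behind the published tokenizer files:

# sketch: train, save, reload, and round-trip the Hindi BPE tokenizer
from hindi_bpe import HindiBPE, preprocess_hindi_text
from data_loader import load_hindi_dataset

text = preprocess_hindi_text(load_hindi_dataset(split="train", num_files=100))
bpe = HindiBPE(vocab_size=4500)
bpe.train(text)                        # training also saves the model to ./tokenizer/

bpe = HindiBPE.load_tokenizer("tokenizer")
tokens = bpe.encode("हिंदी भाषा बहुत सुंदर है।")
print(tokens)
print(bpe.decode(tokens))              # decoding simply concatenates the tokens

This mirrors the flow the test suite (test_hindi_bpe.py) exercises end to end.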
app.py
ADDED
@@ -0,0 +1,136 @@
+import gradio as gr
+from huggingface_hub import snapshot_download
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+import pandas as pd
+import plotly.express as px
+import os
+
+# Download tokenizer if not exists
+if not os.path.exists("tokenizer"):
+    snapshot_download(
+        repo_id="aayushraina/bpe-hindi",
+        local_dir="tokenizer",
+        allow_patterns=["*.json"]
+    )
+
+class TokenizerDemo:
+    def __init__(self):
+        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")
+
+    def tokenize_text(self, text: str) -> tuple:
+        """Tokenize text and return visualization"""
+        if not text:
+            return "", None, "Please enter some text"
+
+        # Preprocess
+        text = preprocess_hindi_text(text)
+
+        # Tokenize
+        tokens = self.tokenizer.encode(text)
+
+        # Create visualization
+        token_df = pd.DataFrame({
+            'Token': tokens,
+            'Length': [len(token) for token in tokens]
+        })
+
+        fig = px.scatter(token_df,
+                         x=range(len(tokens)),
+                         y='Length',
+                         hover_data=['Token'],
+                         title='Token Lengths in Sequence')
+
+        # Calculate statistics
+        stats = {
+            'Total Tokens': len(tokens),
+            'Unique Tokens': len(set(tokens)),
+            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
+            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
+        }
+
+        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
+                              for k, v in stats.items())
+
+        return (
+            " ".join(tokens),  # Tokenized text
+            fig,               # Visualization
+            stats_str          # Statistics
+        )
+
+    def decode_tokens(self, tokens_text: str) -> str:
+        """Decode space-separated tokens back to text"""
+        if not tokens_text:
+            return "Please tokenize some text first"
+        tokens = tokens_text.split()
+        return self.tokenizer.decode(tokens)
+
+# Create Gradio interface
+demo = TokenizerDemo()
+
+interface = gr.Blocks(title="Hindi BPE Tokenizer")
+
+with interface:
+    gr.Markdown("""
+    # Hindi BPE Tokenizer Demo
+
+    This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
+    Enter Hindi text to see how it gets tokenized and analyze the token distribution.
+
+    [View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Hindi Text",
+                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
+                lines=5
+            )
+            tokenize_btn = gr.Button("Tokenize")
+
+        with gr.Column():
+            tokens_output = gr.Textbox(
+                label="Tokenized Output",
+                lines=5
+            )
+            decode_btn = gr.Button("Decode")
+
+            original_output = gr.Textbox(
+                label="Decoded Text",
+                lines=5
+            )
+
+    stats_output = gr.Textbox(
+        label="Tokenization Statistics",
+        lines=4
+    )
+
+    plot_output = gr.Plot(
+        label="Token Length Distribution"
+    )
+
+    # Set up event handlers
+    tokenize_btn.click(
+        fn=demo.tokenize_text,
+        inputs=input_text,
+        outputs=[tokens_output, plot_output, stats_output]
+    )
+
+    decode_btn.click(
+        fn=demo.decode_tokens,
+        inputs=tokens_output,
+        outputs=original_output
+    )
+
+    # Add examples
+    gr.Examples(
+        examples=[
+            ["हिंदी भाषा बहुत सुंदर है।"],
+            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
+            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
+        ],
+        inputs=input_text
+    )
+
+# Launch the interface
+interface.launch()
app_gradio.py
ADDED
@@ -0,0 +1,123 @@
+import gradio as gr
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+import pandas as pd
+import plotly.express as px
+import json
+
+class TokenizerDemo:
+    def __init__(self):
+        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")
+
+    def tokenize_text(self, text: str) -> tuple:
+        """Tokenize text and return visualization"""
+        # Preprocess
+        text = preprocess_hindi_text(text)
+
+        # Tokenize
+        tokens = self.tokenizer.encode(text)
+
+        # Create visualization
+        token_df = pd.DataFrame({
+            'Token': tokens,
+            'Length': [len(token) for token in tokens]
+        })
+
+        fig = px.scatter(token_df,
+                         x=range(len(tokens)),
+                         y='Length',
+                         hover_data=['Token'],
+                         title='Token Lengths in Sequence')
+
+        # Calculate statistics
+        stats = {
+            'Total Tokens': len(tokens),
+            'Unique Tokens': len(set(tokens)),
+            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
+            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
+        }
+
+        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
+                              for k, v in stats.items())
+
+        return (
+            " ".join(tokens),  # Tokenized text
+            fig,               # Visualization
+            stats_str          # Statistics
+        )
+
+    def decode_tokens(self, tokens_text: str) -> str:
+        """Decode space-separated tokens back to text"""
+        tokens = tokens_text.split()
+        return self.tokenizer.decode(tokens)
+
+def create_demo() -> gr.Interface:
+    """Create Gradio interface"""
+    demo = TokenizerDemo()
+
+    with gr.Blocks(title="Hindi BPE Tokenizer") as interface:
+        gr.Markdown("""
+        # Hindi BPE Tokenizer Demo
+
+        This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
+        Enter Hindi text to see how it gets tokenized and analyze the token distribution.
+        """)
+
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(
+                    label="Input Hindi Text",
+                    placeholder="हिंदी में टेक्स्ट दर्ज करें...",
+                    lines=5
+                )
+                tokenize_btn = gr.Button("Tokenize")
+
+            with gr.Column():
+                tokens_output = gr.Textbox(
+                    label="Tokenized Output",
+                    lines=5
+                )
+                decode_btn = gr.Button("Decode")
+
+                original_output = gr.Textbox(
+                    label="Decoded Text",
+                    lines=5
+                )
+
+        stats_output = gr.Textbox(
+            label="Tokenization Statistics",
+            lines=4
+        )
+
+        plot_output = gr.Plot(
+            label="Token Length Distribution"
+        )
+
+        # Set up event handlers
+        tokenize_btn.click(
+            fn=demo.tokenize_text,
+            inputs=input_text,
+            outputs=[tokens_output, plot_output, stats_output]
+        )
+
+        decode_btn.click(
+            fn=demo.decode_tokens,
+            inputs=tokens_output,
+            outputs=original_output
+        )
+
+        # Add examples
+        gr.Examples(
+            examples=[
+                ["हिंदी भाषा बहुत सुंदर है।"],
+                ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
+                ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
+            ],
+            inputs=input_text
+        )
+
+    return interface
+
+# Create and launch the demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()
byte_pair_encoder.py
ADDED
@@ -0,0 +1,222 @@
+from typing import List, Dict, Optional
+from tqdm import tqdm
+from collections import Counter
+from matplotlib import pyplot as plt
+import json
+from pathlib import Path
+
+class TrieNode:
+    """Node in the prefix tree (trie) for fast token matching"""
+    def __init__(self):
+        self.children = {}
+        self.is_token = False
+        self.token = None
+
+class BytePairEncoder:
+    def __init__(self, text: str):
+        # Initialize vocabulary from characters
+        self.chars = sorted(list(set(text)))
+        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
+        self.itos = {i: ch for i, ch in enumerate(self.chars)}
+
+        # Initial encoding of text
+        self.data = [self.stoi[c] for c in text]
+
+        # Statistics tracking
+        self.stats = {
+            "vocab_sizes": [len(self.chars)],
+            "data_sizes": [len(self.data)],
+            "compression_ratios": [1.0],
+            "merge_counts": [],
+            "tokens_created": [],
+            "max_token_lengths": [1],
+        }
+
+        # Store original length for compression ratio
+        self.original_length = len(self.data)
+        self.max_token_length = 1
+
+    def get_digram_stats(self) -> Counter:
+        """Get digram counts"""
+        counts = Counter()
+        for pair in zip(self.data, self.data[1:]):
+            pair = (int(pair[0]), int(pair[1]))
+            counts[pair] += 1
+        return counts
+
+    def encode_to_vocab_size(self, target_vocab_size: int, plot_interval: Optional[int] = None,
+                             print_interval: int = 100) -> None:
+        """Train until reaching target vocabulary size"""
+        pbar = tqdm(total=target_vocab_size, desc="Training BPE", initial=len(self.chars))
+
+        iteration = 0
+        while len(self.itos) < target_vocab_size:
+            result = self._merge_step()
+            if result is None:
+                break
+
+            iteration += 1
+            pbar.update(1)
+
+            if print_interval and iteration % print_interval == 0:
+                self._print_progress(iteration)
+
+            if plot_interval and iteration % plot_interval == 0:
+                self.plot_statistics(iteration=iteration)
+
+        pbar.close()
+
+    def _merge_step(self):
+        """Perform one merge operation"""
+        stats = self.get_digram_stats()
+        if not stats:
+            return None
+
+        top_pair, count = max(stats.items(), key=lambda x: x[1])
+        new_token = self._add_token(top_pair)
+        self.data = self._replace_pairs(top_pair, new_token)
+        self._update_stats(count)
+
+        return new_token, count
+
+    def _add_token(self, pair: tuple) -> int:
+        """Add new token to vocabulary"""
+        token_str = self.itos[pair[0]] + self.itos[pair[1]]
+        token_id = len(self.itos)
+        self.stoi[token_str] = token_id
+        self.itos[token_id] = token_str
+        self.max_token_length = max(self.max_token_length, len(token_str))
+        return token_id
+
+    def _replace_pairs(self, pair: tuple, new_token: int) -> List[int]:
+        """Replace all occurrences of pair with new token"""
+        result = []
+        i = 0
+        while i < len(self.data):
+            if i < len(self.data) - 1 and self.data[i] == pair[0] and self.data[i + 1] == pair[1]:
+                result.append(new_token)
+                i += 2
+            else:
+                result.append(self.data[i])
+                i += 1
+        return result
+
+    def _update_stats(self, merge_count: int):
+        """Update training statistics"""
+        self.stats["vocab_sizes"].append(len(self.itos))
+        self.stats["data_sizes"].append(len(self.data))
+        compression = self.original_length / len(self.data)
+        self.stats["compression_ratios"].append(compression)
+        self.stats["merge_counts"].append(merge_count)
+        self.stats["tokens_created"].append(self.itos[len(self.itos)-1])
+        self.stats["max_token_lengths"].append(self.max_token_length)
+
+    def plot_statistics(self, iteration: Optional[int] = None):
+        """Plot training statistics"""
+        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
+
+        # Plot training metrics
+        ax1.plot(self.stats["vocab_sizes"], self.stats["data_sizes"])
+        ax1.set_title("Vocabulary vs Dataset Size")
+
+        ax2.plot(self.stats["vocab_sizes"], self.stats["compression_ratios"])
+        ax2.set_title("Compression Ratio Progress")
+
+        if self.stats["merge_counts"]:
+            ax3.hist(self.stats["merge_counts"], bins=30)
+            ax3.set_title("Merge Counts Distribution")
+
+        if self.stats["tokens_created"]:
+            lengths = [len(t) for t in self.stats["tokens_created"]]
+            ax4.plot(range(len(lengths)), lengths)
+            ax4.set_title("Token Length Evolution")
+
+        plt.tight_layout()
+        plt.show()
+
+    def save_to_file(self, filepath: Path):
+        """Save encoder state"""
+        state = {
+            "chars": self.chars,
+            "stoi": self.stoi,
+            "max_token_length": self.max_token_length,
+            "stats": self.stats
+        }
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(state, f, ensure_ascii=False, indent=2)
+
+    @classmethod
+    def load_from_file(cls, filepath: Path):
+        """Load encoder state"""
+        with open(filepath, 'r', encoding='utf-8') as f:
+            state = json.load(f)
+
+        instance = cls("")  # Create empty instance
+        instance.chars = state["chars"]
+        instance.stoi = state["stoi"]
+        instance.itos = {int(i): s for s, i in state["stoi"].items()}
+        instance.max_token_length = state["max_token_length"]
+        instance.stats = state["stats"]
+
+        return instance
+
+    def _print_progress(self, iteration: int):
+        """Print training progress"""
+        print(f"\nIteration {iteration}:")
+        print(f"Vocabulary size: {len(self.itos):,}")
+        print(f"Data size: {len(self.data):,}")
+        print(f"Compression ratio: {self.stats['compression_ratios'][-1]:.2f}")
+
+        if self.stats["merge_counts"]:
+            last_merge = self.stats["merge_counts"][-1]
+            last_token = self.stats["tokens_created"][-1]
+            print(f"Last merge count: {last_merge:,}")
+            print(f"Last token created: '{last_token}'")
+
+        print(f"Max token length: {self.max_token_length}")
+
+class TokenizerInternal:
+    """Tokenizer using trained BPE model"""
+    def __init__(self, encoder: BytePairEncoder):
+        self.stoi = encoder.stoi
+        self.max_token_length = encoder.max_token_length
+        self._trie = self._build_trie()
+
+    def _build_trie(self) -> TrieNode:
+        """Build trie for efficient tokenization"""
+        root = TrieNode()
+        for token in self.stoi:
+            node = root
+            for char in token:
+                if char not in node.children:
+                    node.children[char] = TrieNode()
+                node = node.children[char]
+            node.is_token = True
+            node.token = token
+        return root
+
+    def tokenize(self, text: str) -> List[str]:
+        """Tokenize text using trie-based matching"""
+        tokens = []
+        pos = 0
+        while pos < len(text):
+            token = self._find_longest_token(text[pos:])
+            tokens.append(token)
+            pos += len(token)
+        return tokens
+
+    def _find_longest_token(self, text: str) -> str:
+        """Find longest matching token starting at current position"""
+        node = self._trie
+        longest = text[0]
+        current = ""
+
+        for char in text[:self.max_token_length]:
+            if char not in node.children:
+                break
+            current += char
+            node = node.children[char]
+            if node.is_token:
+                longest = node.token
+
+        return longest
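A toy walk-through of the encoder above, assuming the classes exactly as defined in this file; it drives one merge directly through the private _merge_step helper, so treat it as an illustration of the mechanics rather than the intended public entry point (which is encode_to_vocab_size):

# sketch: one BPE merge on a toy string, then trie-based tokenization
from byte_pair_encoder import BytePairEncoder, TokenizerInternal

enc = BytePairEncoder("अनार अनार")
print(len(enc.itos))                 # initial vocab = number of distinct characters
enc._merge_step()                    # merges the most frequent digram into a new token
print(enc.itos[len(enc.itos) - 1])   # the newly created (two-character) token string

tok = TokenizerInternal(enc)
print(tok.tokenize("अनार"))          # greedy longest-match segmentation using the trie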
data_loader.py
ADDED
@@ -0,0 +1,114 @@
+import pandas as pd
+from pathlib import Path
+from typing import List, Dict, Tuple
+import random
+
+def load_hindi_dataset(base_path: str = "data", split: str = "train", num_files: int = None) -> str:
+    """
+    Load Hindi text from dataset with train/validation split structure.
+
+    Args:
+        base_path: Base directory containing train and validation folders
+        split: Either 'train' or 'valid'
+        num_files: Number of files to load (None for all files)
+    """
+    base_dir = Path(base_path)
+    split_dir = base_dir / split / split
+
+    if not split_dir.exists():
+        raise FileNotFoundError(f"Directory not found: {split_dir}")
+
+    print(f"\nLoading Hindi dataset from {split_dir}")
+
+    # Get all txt files in the directory
+    txt_files = list(split_dir.glob("*.txt"))
+
+    if not txt_files:
+        raise FileNotFoundError(f"No txt files found in {split_dir}")
+
+    # Sort files by word count (filenames start with the word count, e.g. "1500.txt" or "1500_1.txt")
+    txt_files.sort(key=lambda x: int(x.stem.split('_')[0]))
+
+    # Sample files if num_files is specified
+    if num_files is not None:
+        if num_files < len(txt_files):
+            txt_files = random.sample(txt_files, num_files)
+
+    print(f"Found {len(txt_files)} files")
+
+    # Load and combine text from files
+    texts = []
+    total_chars = 0
+    total_words = 0
+
+    for idx, file_path in enumerate(txt_files, 1):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                text = f.read().strip()
+            word_count = int(file_path.stem.split('_')[0])  # Filename encodes the word count
+            texts.append(text)
+            total_chars += len(text)
+            total_words += word_count
+
+            if idx % 10 == 0:
+                print(f"Processed {idx}/{len(txt_files)} files. "
+                      f"Total characters: {total_chars:,}, "
+                      f"Total words: {total_words:,}")
+
+        except Exception as e:
+            print(f"Error reading file {file_path}: {e}")
+            continue
+
+    combined_text = "\n\n".join(texts)
+
+    print(f"\nDataset loading completed:")
+    print(f"Total files: {len(texts)}")
+    print(f"Total characters: {len(combined_text):,}")
+    print(f"Total words: {total_words:,}")
+    print(f"Average words per file: {total_words/len(texts):,.1f}")
+
+    return combined_text
+
+def get_dataset_stats(base_path: str = "data") -> Dict:
+    """Get statistics about the dataset."""
+    stats = {}
+    for split in ['train', 'valid']:
+        split_dir = Path(base_path) / split
+        if split_dir.exists():
+            txt_files = list(split_dir.glob("*.txt"))
+            word_counts = [int(f.stem.split('_')[0]) for f in txt_files]
+            stats[split] = {
+                'num_files': len(txt_files),
+                'total_words': sum(word_counts),
+                'min_words': min(word_counts) if word_counts else 0,
+                'max_words': max(word_counts) if word_counts else 0,
+                'avg_words': sum(word_counts)/len(word_counts) if word_counts else 0
+            }
+    return stats
+
+def load_train_valid_split(base_path: str = "data",
+                           train_files: int = None,
+                           valid_files: int = None) -> Tuple[str, str]:
+    """Load both train and validation splits."""
+    train_text = load_hindi_dataset(base_path, "train", train_files)
+    valid_text = load_hindi_dataset(base_path, "valid", valid_files)
+    return train_text, valid_text
+
+if __name__ == "__main__":
+    # Print dataset statistics
+    stats = get_dataset_stats()
+    print("\nDataset Statistics:")
+    print("-" * 50)
+    for split, split_stats in stats.items():
+        print(f"\n{split.upper()} Split:")
+        for key, value in split_stats.items():
+            if isinstance(value, (int, float)):
+                print(f"{key}: {value:,}")
+            else:
+                print(f"{key}: {value}")
+
+    # Load sample data
+    print("\nLoading sample data...")
+    train_text, valid_text = load_train_valid_split(train_files=5, valid_files=2)
+    print(f"\nSample train text (first 200 chars):\n{train_text[:200]}")
+    print(f"\nSample valid text (first 200 chars):\n{valid_text[:200]}")
download_dataset.py
ADDED
@@ -0,0 +1,184 @@
+import kagglehub
+from pathlib import Path
+import shutil
+import pandas as pd
+import numpy as np  # pd.np is deprecated/removed in recent pandas; use numpy directly
+import re
+import nltk
+from typing import List, Dict
+from tqdm import tqdm
+
+def count_hindi_words(text: str) -> int:
+    """Count words in Hindi text."""
+    words = text.strip().split()
+    hindi_words = [w for w in words if re.search(r'[\u0900-\u097F]', w)]
+    return len(hindi_words)
+
+def create_dataframe_from_files(downloaded_paths: List[str]) -> pd.DataFrame:
+    """Create a DataFrame from downloaded text files."""
+    print("\nCreating DataFrame from text files...")
+
+    data = []
+    for file_path in tqdm(downloaded_paths):
+        if file_path.endswith('.txt'):
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read().strip()
+
+                # Split into title and text (assuming first line is title)
+                lines = content.split('\n', 1)
+                title = lines[0].strip()
+                text = lines[1].strip() if len(lines) > 1 else ""
+
+                data.append({
+                    'title': title,
+                    'text': text,
+                    'word_count': count_hindi_words(content)
+                })
+            except Exception as e:
+                print(f"Error reading file {file_path}: {e}")
+                continue
+
+    df = pd.DataFrame(data)
+    print(f"Created DataFrame with {len(df)} articles")
+    return df
+
+def process_and_split_articles(df: pd.DataFrame,
+                               output_dir: Path,
+                               train_ratio: float = 0.8,
+                               min_words: int = 100,
+                               max_words: int = 5000) -> Dict[str, int]:
+    """Process articles and split them into files based on word count."""
+
+    # Create output directories
+    train_dir = output_dir / "train"
+    valid_dir = output_dir / "valid"
+    train_dir.mkdir(exist_ok=True)
+    valid_dir.mkdir(exist_ok=True)
+
+    stats = {'train': 0, 'valid': 0, 'skipped': 0}
+
+    print("\nProcessing articles...")
+    for _, row in tqdm(df.iterrows(), total=len(df)):
+        try:
+            # Skip if too short or too long
+            if row['word_count'] < min_words or row['word_count'] > max_words:
+                stats['skipped'] += 1
+                continue
+
+            # Combine title and text
+            full_text = f"{row['title']}\n\n{row['text']}"
+
+            # Decide split (train or valid)
+            is_train = np.random.random() < train_ratio
+            split_dir = train_dir if is_train else valid_dir
+
+            # Save to file named by word count
+            file_path = split_dir / f"{row['word_count']}.txt"
+            suffix = 1
+            while file_path.exists():
+                file_path = split_dir / f"{row['word_count']}_{suffix}.txt"
+                suffix += 1
+
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(full_text)
+
+            if is_train:
+                stats['train'] += 1
+            else:
+                stats['valid'] += 1
+
+        except Exception as e:
+            print(f"Error processing article: {e}")
+            stats['skipped'] += 1
+            continue
+
+    return stats
+
+def download_hindi_wikipedia_dataset():
+    """Download and process Hindi Wikipedia dataset."""
+    print("Starting dataset download...")
+
+    try:
+        # Download the dataset using kagglehub
+        downloaded_paths = kagglehub.dataset_download(
+            "disisbig/hindi-wikipedia-articles-172k"
+        )
+
+        print("Dataset downloaded successfully!")
+        print("Downloaded files:", downloaded_paths)
+
+        # Create data directory
+        data_dir = Path("data")
+        data_dir.mkdir(exist_ok=True)
+
+        # Create DataFrame from downloaded files
+        df = create_dataframe_from_files(downloaded_paths)
+
+        # Save DataFrame for future use
+        df.to_parquet(data_dir / "articles.parquet")
+        print(f"Saved DataFrame to {data_dir / 'articles.parquet'}")
+
+        # Process and split the articles
+        stats = process_and_split_articles(df, data_dir)
+
+        # Print statistics
+        print("\nProcessing completed:")
+        print(f"Train files: {stats['train']}")
+        print(f"Validation files: {stats['valid']}")
+        print(f"Skipped articles: {stats['skipped']}")
+
+        # Get file sizes
+        train_size = sum(f.stat().st_size for f in (data_dir / "train").glob("*.txt"))
+        valid_size = sum(f.stat().st_size for f in (data_dir / "valid").glob("*.txt"))
+
+        print(f"\nTotal size:")
+        print(f"Train: {train_size / (1024*1024):.2f} MB")
+        print(f"Validation: {valid_size / (1024*1024):.2f} MB")
+
+        return True
+
+    except Exception as e:
+        print(f"Error downloading/processing dataset: {e}")
+        return False
+
+def verify_dataset_structure():
+    """Verify the dataset directory structure and files."""
+    data_dir = Path("data")
+
+    if not data_dir.exists():
+        print("Error: Data directory not found!")
+        return False
+
+    # Check if we have the processed DataFrame
+    parquet_file = data_dir / "articles.parquet"
+    if parquet_file.exists():
+        df = pd.read_parquet(parquet_file)
+        print(f"\nArticles DataFrame:")
+        print(f"Total articles: {len(df)}")
+        # print(f"Word count range: {df['word_count'].min()} - {df['word_count'].max()}")
+
+    for split in ['train', 'valid']:
+        split_dir = data_dir / split
+        if not split_dir.exists():
+            print(f"Error: {split} directory not found!")
+            return False
+
+        txt_files = list(split_dir.glob("*.txt"))
+        if not txt_files:
+            print(f"Error: No text files found in {split} directory!")
+            return False
+
+        print(f"\n{split.upper()} split:")
+        print(f"Number of files: {len(txt_files)}")
+        word_counts = [int(f.stem.split('_')[0]) for f in txt_files]
+        print(f"Word count range: {min(word_counts)} - {max(word_counts)}")
+
+    return True
+
+if __name__ == "__main__":
+    # Download and process the dataset
+    success = download_hindi_wikipedia_dataset()
+
+    if success:
+        print("\nVerifying dataset structure...")
+        verify_dataset_structure()
hindi_bpe.py
ADDED
@@ -0,0 +1,129 @@
+import re
+from collections import Counter
+from typing import Dict, List, Tuple, Set
+import unicodedata
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from tqdm import tqdm
+import json
+from matplotlib import pyplot as plt
+from pathlib import Path
+from byte_pair_encoder import BytePairEncoder, TokenizerInternal
+
+class HindiBPE:
+    def __init__(self, vocab_size: int = 5000):
+        print(f"\nInitializing HindiBPE with max vocab size: {vocab_size}")
+        self.vocab_size = vocab_size
+        self.encoder = None
+
+    def train(self, text: str) -> None:
+        """Train BPE on Hindi text."""
+        print("\nInitializing BytePairEncoder...")
+        self.encoder = BytePairEncoder(text)
+
+        print("\nTraining BPE...")
+        self.encoder.encode_to_vocab_size(
+            target_vocab_size=self.vocab_size,
+            plot_interval=1000,
+            print_interval=100
+        )
+
+        # Plot final statistics
+        self.encoder.plot_statistics()
+
+        # Save the trained model
+        self.save_tokenizer()
+
+    def encode(self, text: str) -> List[str]:
+        """Encode Hindi text using trained tokenizer."""
+        if self.encoder is None:
+            raise ValueError("Tokenizer not trained yet!")
+
+        print("\nTokenizing text...")
+        tokenizer = TokenizerInternal(self.encoder)
+        tokens = list(tokenizer.tokenize(text))
+
+        compression = self.calculate_compression_ratio(text, tokens)
+        print(f"\nEncoding completed:")
+        print(f"Token count: {len(tokens)}")
+        print(f"Unique tokens: {len(set(tokens))}")
+        print(f"Compression ratio: {compression:.2f}")
+
+        return tokens
+
+    def decode(self, tokens: List[str]) -> str:
+        """Decode tokens back to text."""
+        if self.encoder is None:
+            raise ValueError("Tokenizer not trained yet!")
+
+        print("\nDecoding tokens...")
+        decoded = "".join(tokens)
+        print(f"Decoded length: {len(decoded)} characters")
+        return decoded
+
+    def save_tokenizer(self, path: str = "tokenizer") -> None:
+        """Save the tokenizer to disk."""
+        save_dir = Path(path)
+        save_dir.mkdir(exist_ok=True)
+
+        # Save the encoder
+        self.encoder.save_to_file(save_dir / "encoder.json")
+
+        # Save vocabulary stats
+        stats = self.get_token_statistics()
+        with open(save_dir / "vocab_stats.json", "w") as f:
+            json.dump(stats, f, indent=2)
+
+        print(f"Tokenizer saved to {save_dir}")
+
+    @classmethod
+    def load_tokenizer(cls, path: str = "tokenizer") -> "HindiBPE":
+        """Load a trained tokenizer from disk."""
+        load_dir = Path(path)
+        if not load_dir.exists():
+            raise FileNotFoundError(f"Tokenizer directory not found: {load_dir}")
+
+        # Create instance
+        instance = cls()
+
+        # Load encoder
+        instance.encoder = BytePairEncoder.load_from_file(load_dir / "encoder.json")
+
+        print(f"Loaded tokenizer from {load_dir}")
+        print(f"Vocabulary size: {len(instance.encoder.itos)}")
+        return instance
+
+    def get_token_statistics(self) -> Dict:
+        """Get statistics about the learned tokens."""
+        if self.encoder is None:
+            raise ValueError("Tokenizer not trained yet!")
+
+        token_lengths = [len(token) for token in self.encoder.itos.values()]
+        return {
+            'vocab_size': len(self.encoder.itos),
+            'avg_token_length': sum(token_lengths) / len(token_lengths),
+            'min_token_length': min(token_lengths),
+            'max_token_length': max(token_lengths),
+            'length_distribution': Counter(token_lengths),
+            'training_stats': self.encoder.stats
+        }
+
+    def calculate_compression_ratio(self, text: str, tokens: List[str]) -> float:
+        """Calculate compression ratio."""
+        original_size = len(text)
+        encoded_size = sum(len(token) for token in tokens)
+        return original_size / encoded_size
+
+def preprocess_hindi_text(text: str) -> str:
+    """Preprocess Hindi text for better BPE training."""
+    # Remove excessive whitespace
+    text = re.sub(r'\s+', ' ', text.strip())
+
+    # Normalize Unicode characters
+    text = unicodedata.normalize('NFKC', text)
+
+    # Remove unnecessary punctuation (keep essential ones)
+    text = re.sub(r'[^\u0900-\u097F\s।]', '', text)
+
+    return text
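A small illustration of preprocess_hindi_text above; the sample string is hypothetical, and the comments only restate what the three steps (whitespace collapsing, NFKC normalization, character filtering) imply:

# sketch: what the preprocessing step keeps and drops
from hindi_bpe import preprocess_hindi_text

raw = "हिंदी   BPE tokenizer!  यह बहुत उपयोगी है।"
print(preprocess_hindi_text(raw))
# Latin letters, digits, and '!' are stripped (their surrounding spaces remain);
# Devanagari text, whitespace, and the danda '।' survive.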
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+numpy==1.23.5
+pandas==1.5.3
+plotly==5.13.0
+kagglehub
+streamlit
+beautifulsoup4
+huggingface-hub>=0.19.0
+tqdm
+matplotlib
+gitpython>=3.1.0
+gradio>=4.0.0
test_hindi_bpe.py
ADDED
@@ -0,0 +1,148 @@
+import unittest
+from pathlib import Path
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+from data_loader import load_hindi_dataset
+import pandas as pd
+import plotly.express as px
+from typing import List, Dict
+import time
+
+class HindiBPETest:
+    def __init__(self, vocab_size: int = 4500, num_articles: int = 1000):
+        self.vocab_size = vocab_size
+        self.num_articles = num_articles
+        self.bpe = None
+        self.text = None
+        self.encoded = None
+        self.stats = {}
+
+    def load_data(self) -> str:
+        """Load and preprocess the dataset."""
+        print("\nStep 1: Loading dataset...")
+        start_time = time.time()
+
+        # Load train split
+        self.text = load_hindi_dataset(
+            split="train",
+            num_files=self.num_articles
+        )
+        self.text = preprocess_hindi_text(self.text)
+
+        # Get validation text for testing
+        self.valid_text = load_hindi_dataset(
+            split="valid",
+            num_files=min(self.num_articles // 5, 100)  # 20% of train size or max 100
+        )
+
+        self.stats['load_time'] = time.time() - start_time
+        self.stats['original_length'] = len(self.text)
+        self.stats['valid_length'] = len(self.valid_text)
+        print(f"Loading completed in {self.stats['load_time']:.2f} seconds")
+        return self.text
+
+    def train_tokenizer(self) -> HindiBPE:
+        """Train the BPE tokenizer."""
+        print("\nStep 2: Training BPE tokenizer...")
+        start_time = time.time()
+
+        self.bpe = HindiBPE(vocab_size=self.vocab_size)
+        self.bpe.train(self.text)
+
+        self.stats['train_time'] = time.time() - start_time
+        self.stats['vocab_size'] = len(self.bpe.encoder.itos)  # learned vocabulary lives on the encoder
+        print(f"Training completed in {self.stats['train_time']:.2f} seconds")
+        return self.bpe
+
+    def encode_text(self) -> List[str]:
+        """Encode the text using trained tokenizer."""
+        print("\nStep 3: Encoding text...")
+        start_time = time.time()
+
+        self.encoded = self.bpe.encode(self.text)
+
+        self.stats['encode_time'] = time.time() - start_time
+        self.stats['encoded_length'] = sum(len(token) for token in self.encoded)
+        self.stats['compression_ratio'] = self.stats['original_length'] / self.stats['encoded_length']
+        print(f"Encoding completed in {self.stats['encode_time']:.2f} seconds")
+        return self.encoded
+
+    def save_visualizations(self, output_dir: str = "output"):
+        """Generate and save visualizations."""
+        print("\nStep 4: Generating visualizations...")
+        output_dir = Path(output_dir)
+        output_dir.mkdir(exist_ok=True)
+
+        # Token length distribution
+        token_lengths = [len(token) for token in self.bpe.encoder.itos.values()]
+        df = pd.DataFrame({'Length': token_lengths})
+        fig = px.histogram(df, x='Length',
+                           title='Token Length Distribution',
+                           labels={'Length': 'Token Length', 'count': 'Frequency'})
+        fig.write_html(output_dir / "token_distribution.html")
+
+        # Compression visualization
+        comp_df = pd.DataFrame({
+            'Stage': ['Original', 'Encoded'],
+            'Size': [self.stats['original_length'], self.stats['encoded_length']]
+        })
+        fig = px.bar(comp_df, x='Stage', y='Size',
+                     title='Text Compression Comparison')
+        fig.write_html(output_dir / "compression.html")
+
+        # Save statistics to CSV
+        pd.DataFrame([self.stats]).to_csv(output_dir / "stats.csv")
+        print(f"Visualizations saved to {output_dir}")
+
+    def print_summary(self):
+        """Print summary of the tokenization process."""
+        print("\nTokenization Summary:")
+        print("-" * 50)
+        print(f"Dataset size: {self.stats['original_length']:,} characters")
+        print(f"Vocabulary size: {self.stats['vocab_size']:,} tokens")
+        print(f"Compression ratio: {self.stats['compression_ratio']:.2f}")
+        print(f"\nProcessing times:")
+        print(f"Loading: {self.stats['load_time']:.2f} seconds")
+        print(f"Training: {self.stats['train_time']:.2f} seconds")
+        print(f"Encoding: {self.stats['encode_time']:.2f} seconds")
+
+    def run_full_pipeline(self) -> Dict:
+        """Run the complete tokenization pipeline."""
+        self.load_data()
+        self.train_tokenizer()
+        self.encode_text()
+        self.save_visualizations()
+        self.print_summary()
+        return self.stats
+
+def main():
+    # Example usage
+    test = HindiBPETest(vocab_size=4500, num_articles=1000)
+    stats = test.run_full_pipeline()
+
+    # Test tokenization on a sample text
+    sample_text = """
+    भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।
+    हिंदी भारत की प्रमुख भाषाओं में से एक है।
+    """
+
+    print("\nTesting tokenization on sample text:")
+    tokens = test.bpe.encode(sample_text)
+    print(f"Original text: {sample_text}")
+    print(f"Tokens: {tokens}")
+    decoded = test.bpe.decode(tokens)
+    print(f"Decoded text: {decoded}")
+
+    # Verify compression ratio requirement
+    if stats['compression_ratio'] >= 3.2:
+        print("\nSuccess: Achieved required compression ratio ≥ 3.2")
+    else:
+        print("\nWarning: Compression ratio below target 3.2")
+
+    # Verify vocabulary size requirement
+    if stats['vocab_size'] < 5000:
+        print("Success: Vocabulary size within limit < 5000")
+    else:
+        print("Warning: Vocabulary size exceeds limit")
+
+if __name__ == "__main__":
+    main()
tokenizer/encoder.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d559243e553c94c34426987f1f4411554cc5ece15bce3f30d53668ec876f39dc
+size 540153
tokenizer/vocab_stats.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f30c70f6c7adee48d16f6796cf43b321b89f3927efffd1d16c6cff19b343f74d
+size 474148
tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f062eed13569bd463ce96289411c435a2f328ea07a106946dbe9274489a1bc6
+size 359