ligeti committed · Commit 7e1c1a4 · verified · 1 Parent(s): 503d528

Update README.md

Files changed (1):
  1. README.md (+3 -48)

README.md CHANGED
@@ -20,20 +20,15 @@ tokenization_parameters = {
     'kmer': 6,
     'shift': 1
 }
-
 # Initialize the tokenizer and model
 tokenizer = ProkBERTTokenizer(tokenization_params=tokenization_parameters, operation_space='sequence')
 model = MegatronBertForMaskedLM.from_pretrained("nerualbioinfo/prokbert-mini-k6s2")
-
 # Example DNA sequence
 sequence = 'ATGTCCGCGGGACCT'
-
 # Tokenize the sequence
 inputs = tokenizer(sequence, return_tensors="pt")
-
 # Ensure that inputs have a batch dimension
 inputs = {key: value.unsqueeze(0) for key, value in inputs.items()}
-
 # Generate outputs from the model
 outputs = model(**inputs)
 ```
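For context on the snippet retained above: with `'kmer': 6` and `'shift': 1`, ProkBERT's LCA tokenization slides a 6-base window along the sequence one base at a time, producing overlapping 6-mers. A minimal plain-Python sketch of that windowing (illustrative only; the real `ProkBERTTokenizer` additionally maps k-mers to ids and adds special tokens):

```python
# Sliding-window k-mer decomposition with kmer=6, shift=1,
# matching the tokenization_parameters above.
sequence = 'ATGTCCGCGGGACCT'
kmer, shift = 6, 1

kmers = [sequence[i:i + kmer] for i in range(0, len(sequence) - kmer + 1, shift)]
print(kmers)
# ['ATGTCC', 'TGTCCG', 'GTCCGC', 'TCCGCG', 'CCGCGG',
#  'CGCGGG', 'GCGGGA', 'CGGGAC', 'GGGACC', 'GGACCT']
```

A larger `shift` would skip bases between consecutive windows, trading positional resolution for shorter token sequences.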
@@ -91,48 +86,6 @@ After segmentation, sequences are encoded into a vector format. The LCA method a
 4. **Create a Padded/Truncated Array**: Generate a uniform array structure, padding or truncating as necessary.
 5. **Save the Array to HDF**: Store the processed data in an HDF (Hierarchical Data Format) file for efficient retrieval and use in training models.
 
-```python
-import pkg_resources
-from os.path import join
-from prokbert.sequtils import *
-
-# Directory for pretraining FASTA files
-pretraining_fasta_files_dir = pkg_resources.resource_filename('prokbert','data/pretraining')
-
-# Define segmentation and tokenization parameters
-segmentation_params = {
-    'max_length': 256,  # Split the sequence into segments of length L
-    'min_length': 6,
-    'type': 'random'
-}
-tokenization_parameters = {
-    'kmer': 6,
-    'shift': 1,
-    'max_segment_length': 2003,
-    'token_limit': 2000
-}
-
-# Setup configuration
-defconfig = SeqConfig()
-segmentation_params = defconfig.get_and_set_segmentation_parameters(segmentation_params)
-tokenization_params = defconfig.get_and_set_tokenization_parameters(tokenization_parameters)
-
-# Load and segment sequences
-input_fasta_files = [join(pretraining_fasta_files_dir, file) for file in get_non_empty_files(pretraining_fasta_files_dir)]
-sequences = load_contigs(input_fasta_files, IsAddHeader=True, adding_reverse_complement=True, AsDataFrame=True, to_uppercase=True, is_add_sequence_id=True)
-segment_db = segment_sequences(sequences, segmentation_params, AsDataFrame=True)
-
-# Tokenization
-tokenized = batch_tokenize_segments_with_ids(segment_db, tokenization_params)
-expected_max_token = max(len(arr) for arrays in tokenized.values() for arr in arrays)
-X, torchdb = get_rectangular_array_from_tokenized_dataset(tokenized, tokenization_params['shift'], expected_max_token)
-
-# Save to HDF file
-hdf_file = '/tmp/pretraining.h5'
-save_to_hdf(X, hdf_file, database=torchdb, compression=True)
-```
-
-
 
 ### Installation of ProkBERT (if needed)
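The block removed above relied on prokbert's own helpers (`get_rectangular_array_from_tokenized_dataset`, `save_to_hdf`) for steps 4 and 5 of the list. As a generic illustration of what those steps amount to, here is a minimal numpy/h5py sketch; the token-id lists, the pad id of 0, and the `'training_data/X'` dataset name are assumptions made for the example, not prokbert's actual conventions:

```python
import numpy as np
import h5py

# Hypothetical ragged output of tokenization: token-id lists of varying length.
tokenized_segments = [[2, 17, 45, 3], [2, 9, 3], [2, 31, 22, 8, 40, 3]]

max_len = 5   # target width of the rectangular array
pad_id = 0    # assumed padding token id

# Step 4: pad short rows and truncate long ones to a uniform width.
X = np.full((len(tokenized_segments), max_len), pad_id, dtype=np.int32)
for i, ids in enumerate(tokenized_segments):
    row = ids[:max_len]
    X[i, :len(row)] = row

# Step 5: store the rectangular array in a compressed HDF5 dataset.
with h5py.File('/tmp/pretraining_demo.h5', 'w') as f:
    f.create_dataset('training_data/X', data=X, compression='gzip')
```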
 
@@ -177,8 +130,10 @@ Please report any issues with the model or its outputs to the Neural Bioinformat
 - **Feedback and inquiries:** [[email protected]](mailto:[email protected])
 
 ## Reference
-```
 If you use ProkBERT-mini in your research, please cite the following paper:
+
+
+```
 @ARTICLE{10.3389/fmicb.2023.1331233,
 AUTHOR={Ligeti, Balázs and Szepesi-Nagy, István and Bodnár, Babett and Ligeti-Nagy, Noémi and Juhász, János},
 TITLE={ProkBERT family: genomic language models for microbiome applications},
 