What kind of pre-tokenizer are you saving? If some building blocks are missing, we could add them to make the whole thing more composable/portable/shareable.
Hi @Narsil, k-mer tokenization is used in many bioinformatics applications. Right now I am doing the following to define my tokenizer and to save and load my model, which I now know is not ideal. I wondered if there is a way to use serializable building blocks to save/load the tokenizer like any other HF tokenizer. Thank you
from itertools import product
import torch
from torchtext.data.utils import get_tokenizer
from tokenizers import Tokenizer, PreTokenizedString, NormalizedString
from tokenizers.pre_tokenizers import PreTokenizer, Whitespace
from tokenizers.models import WordLevel
from typing import List, Tuple
# Define the pre-tokenizer steps (just split the string into chunks of size k)
class KmerPreTokenizer:
    def __init__(self, k: int, stride: int = None):
        self.k = k
        self.stride = stride if stride else k

    def split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        seq = normalized.original
        # Slice the NormalizedString into k-mers, stepping by the stride
        return [normalized[j:j + self.k] for j in range(0, len(seq) - self.k + 1, self.stride)]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.split)
# Optional decoder that simply concatenates the k-mers back together
class KmerDecoder:
    def decode(self, tokens: List[str]) -> str:
        return "".join(tokens)
# Build the vocabulary: all k-mers over ACGTN, with the N-containing k-mers last
k = 4
good_kmers = []
bad_kmers = []
kmers = [''.join(kmer) for kmer in product('ACGTN', repeat=k)]
for kmer in kmers:
    if "N" in kmer:
        bad_kmers.append(kmer)
    else:
        good_kmers.append(kmer)
kmers = good_kmers + bad_kmers
vocab = {word: i for i, word in enumerate(kmers)}
vocab["[UNK]"] = len(vocab)  # WordLevel expects the unk token to be present in the vocabulary
# Use the vocab and the pre-tokenizer to build a customized k-mer tokenizer
# Create a WordLevel model from the vocabulary
tok = Tokenizer(WordLevel(vocab=vocab, unk_token="[UNK]"))
tok.pre_tokenizer = PreTokenizer.custom(KmerPreTokenizer(k))
# tok.decoder = Decoder.custom(KmerDecoder())  # requires: from tokenizers.decoders import Decoder
# Optional: Train the tokenizer (if you want to add more tokens or further refine it)
# trainer = WordLevelTrainer(special_tokens=["<MASK>", "<CLS>", "<UNK>"])
# tokenizer.train_from_iterator(kmer_iter, trainer)
# Save or use the tokenizer
# tokenizer.save("path/to/tokenizer.json")
sequence = "ACGCGCGCGTGGAGCGCGATCGACTTT"
print("PreTokenize:", sequence)
print(tok.pre_tokenizer.pre_tokenize_str(sequence))
# Save the tokenizer
from transformers import PreTrainedTokenizerFast

# A custom Python pre-tokenizer cannot be serialized, so swap in a built-in one before saving
tok.pre_tokenizer = Whitespace()
new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tok)
# new_tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(KmerPreTokenizer(k))

# Save the fast tokenizer
new_tokenizer.save_pretrained("tokenizers")
# Load the tokenizer
from transformers import AutoTokenizer

loaded_tokenizer = AutoTokenizer.from_pretrained("tokenizers")
# Re-attach the custom pre-tokenizer by hand, since it is not part of the saved files
loaded_tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(KmerPreTokenizer(k))
# Test the loaded tokenizer
input_text = "ACGCGCGCGTGGAGCGCGATCGACNTTTT"
print(loaded_tokenizer.tokenize(input_text))
print(loaded_tokenizer(input_text))
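For the non-overlapping case (stride == k), a fully serializable variant also seems possible with only built-in components, using the Split pre-tokenizer with a regex instead of a custom Python class. This is just a minimal sketch (assuming tokenizers' Split and Regex behave the way I expect; the file name kmer_tokenizer.json is made up): because nothing custom is attached, the tokenizer round-trips through JSON without any manual patching.

# Sketch: serializable k-mer splitting for stride == k, built-in components only
from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Split

serializable_tok = Tokenizer(WordLevel(vocab=vocab, unk_token="[UNK]"))
# ".{4}" matches each chunk of 4 characters; "isolated" keeps every match as its own piece
serializable_tok.pre_tokenizer = Split(pattern=Regex(".{4}"), behavior="isolated")
serializable_tok.save("kmer_tokenizer.json")  # hypothetical path

reloaded = Tokenizer.from_file("kmer_tokenizer.json")
print(reloaded.encode("ACGCGCGCGTGGAGCGCGATCGACTTT").tokens)

Overlapping k-mers (stride < k) would still need the custom pre-tokenizer above.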
Originally posted by @millanp95 in #581 (comment)