Ukrainian tokenizer support

This commit is contained in:
Dominik Macháček 2023-06-05 17:29:26 +02:00
parent 260b1f8f88
commit 2625be10b7
2 changed files with 21 additions and 10 deletions

View file

@ -4,7 +4,7 @@ import numpy as np
import librosa import librosa
from functools import lru_cache from functools import lru_cache
import time import time
from mosestokenizer import MosesTokenizer
@lru_cache @lru_cache
@ -207,14 +207,12 @@ class OnlineASRProcessor:
SAMPLING_RATE = 16000 SAMPLING_RATE = 16000
def __init__(self, language, asr): def __init__(self, asr, tokenizer):
"""language: lang. code that MosesTokenizer uses for sentence segmentation """asr: WhisperASR object
asr: WhisperASR object tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
chunk: number of seconds for intended size of audio interval that is inserted and looped
""" """
self.language = language
self.asr = asr self.asr = asr
self.tokenizer = MosesTokenizer(self.language) self.tokenizer = tokenizer
self.init() self.init()
@ -369,7 +367,7 @@ class OnlineASRProcessor:
self.last_chunked_at = time self.last_chunked_at = time
def words_to_sentences(self, words): def words_to_sentences(self, words):
"""Uses mosestokenizer for sentence segmentation of words. """Uses self.tokenizer for sentence segmentation of words.
Returns: [(beg,end,"sentence 1"),...] Returns: [(beg,end,"sentence 1"),...]
""" """
@ -419,6 +417,15 @@ class OnlineASRProcessor:
return (b,e,t) return (b,e,t)
def create_tokenizer(lan):
if lan == "uk":
import tokenize_uk
class UkrainianTokenizer:
def split(self, text):
return tokenize_uk.tokenize_sents(text)
return UkrainianTokenizer()
from mosestokenizer import MosesTokenizer
return MosesTokenizer(lan)
## main: ## main:
@ -482,8 +489,9 @@ if __name__ == "__main__":
print("setting VAD filter",file=sys.stderr) print("setting VAD filter",file=sys.stderr)
asr.use_vad() asr.use_vad()
min_chunk = args.min_chunk_size min_chunk = args.min_chunk_size
online = OnlineASRProcessor(tgt_language,asr) online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))
# load the audio into the LRU cache before we start the timer # load the audio into the LRU cache before we start the timer

View file

@ -48,6 +48,9 @@ asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, mode
if args.task == "translate": if args.task == "translate":
asr.set_translate_task() asr.set_translate_task()
tgt_language = "en"
else:
tgt_language = language
e = time.time() e = time.time()
print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr) print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
@ -58,7 +61,7 @@ if args.vad:
min_chunk = args.min_chunk_size min_chunk = args.min_chunk_size
online = OnlineASRProcessor(language,asr) online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))