diff --git a/3rd_party/marian-dev b/3rd_party/marian-dev index e8a1a2530..3daf4ee29 160000 --- a/3rd_party/marian-dev +++ b/3rd_party/marian-dev @@ -1 +1 @@ -Subproject commit e8a1a2530fb84cbff7383302ebca393e5875c441 +Subproject commit 3daf4ee2906583dfc86e4f6986b40f843e7e3a3c diff --git a/Ftt.def b/Ftt.def new file mode 100644 index 000000000..104883cee --- /dev/null +++ b/Ftt.def @@ -0,0 +1,156 @@ +Bootstrap: docker +From: condaforge/mambaforge:22.11.1-4 +Stage: spython-base + +%files +pipeline/setup/install-deps.sh install-deps.sh +envs/base.yml /conda-envs/c7aefb385f47824bd83e494ba6175afb/environment.yaml +envs/bicleaner-ai-lumi.yml /conda-envs/6dc32b6f0731acf817ada219622a98b8/environment.yaml +envs/bicleaner-ai.yml /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/environment.yaml +envs/bicleaner.yml /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/environment.yaml +envs/corpus.yml /conda-envs/2e8e4401e9abbca04941f823d00fe74a/environment.yaml +envs/tensorboard.yml /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221/environment.yaml +%labels +io.github.snakemake.containerized="true" +io.github.snakemake.conda_env_hash="41592307ee99833c1ad2068c1e915ff9c38acc418b5bebfe7e107d9a79980cb4" +%post + +# Remove this if not in Finland, or change to closer mirror +cat /etc/apt/sources.list | sed "s/archive.ubuntu.com/mirrors.nic.funet.fi/g" > temp && mv temp /etc/apt/sources.list + +apt-get update && apt-get -y install gcc g++ curl + +export DEBIAN_FRONTEND=noninteractive + +bash install-deps.sh + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: envs/base.yml +# prefix: /conda-envs/c7aefb385f47824bd83e494ba6175afb +# name: bergamot-training +# channels: +# - conda-forge +# - defaults +# dependencies: +# - python=3.9 +# - cmake=3.21.1 +# - pip=21.2.2 +# - pip: +# - sacrebleu==2.0.0 +# - mtdata==0.4.0 +# - fasttext==0.9.2 +# - regex==2019.8.19 +# - sacremoses==0.0.43 +mkdir -p /conda-envs/c7aefb385f47824bd83e494ba6175afb + +# Conda environment: +# source: envs/bicleaner-ai-lumi.yml +# prefix: /conda-envs/6dc32b6f0731acf817ada219622a98b8 +# name: bicleaner-ai +# channels: +# - conda-forge +# - defaults +# dependencies: +# - python=3.9 +# - pip==21.2.2 +# - cmake=3.21.1 +# - pip: +# - bicleaner-ai==2.2.1 +# - tensorflow-rocm==2.10.0.520 +mkdir -p /conda-envs/6dc32b6f0731acf817ada219622a98b8 + +# Conda environment: +# source: envs/bicleaner-ai.yml +# prefix: /conda-envs/04b8248cb528961ad452c73dd0a7c8b6 +# name: bicleaner-ai +# channels: +# - conda-forge +# - defaults +# dependencies: +# - python=3.9 +# - pip==21.2.2 +# - cmake=3.21.1 +# - pip: +# - bicleaner-ai==2.2.1 +# - tensorflow==2.6.5 +mkdir -p /conda-envs/04b8248cb528961ad452c73dd0a7c8b6 + +# Conda environment: +# source: envs/bicleaner.yml +# prefix: /conda-envs/a4f700aa6ff0256dcd9321b536a081ac +# name: bicleaner +# channels: +# - conda-forge +# - bitextor +# - defaults +# dependencies: +# - python=3.8 +# - pip==23.0 +# - cmake=3.21.1 +# - hunspell==1.7.0 +# - pip: +# - pypi-kenlm +# - bicleaner==0.16.1 +mkdir -p /conda-envs/a4f700aa6ff0256dcd9321b536a081ac + +# Conda environment: +# source: envs/corpus.yml +# prefix: /conda-envs/2e8e4401e9abbca04941f823d00fe74a +# name: corpus +# channels: +# - conda-forge +# - defaults +# dependencies: +# - python=3.9 +# - pip=21.2.2 +# - pip: +# - sacrebleu==2.0.0 +# - mtdata==0.3.2 +# - requests==2.26.0 +mkdir -p /conda-envs/2e8e4401e9abbca04941f823d00fe74a + +# Conda environment: +# source: envs/tensorboard.yml +# prefix: /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221 +# name: tensorboard +# channels: +# - conda-forge +# - defaults +# dependencies: +# - python=3.9 +# - cmake=3.21.1 +# - pip=21.2.2 +# - pip: +# - tensorboard==2.5.0 +# - tensorboardX==2.2 +# - click==8.0.1 +# - toolz==0.11.1 +mkdir -p /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221 + +# Step 2: Generate conda environments + +mamba env create --prefix /conda-envs/c7aefb385f47824bd83e494ba6175afb --file /conda-envs/c7aefb385f47824bd83e494ba6175afb/environment.yaml && \ +mamba env create --prefix /conda-envs/6dc32b6f0731acf817ada219622a98b8 --file /conda-envs/6dc32b6f0731acf817ada219622a98b8/environment.yaml && \ +mamba env create --prefix /conda-envs/04b8248cb528961ad452c73dd0a7c8b6 --file /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/environment.yaml && \ +mamba env create --prefix /conda-envs/a4f700aa6ff0256dcd9321b536a081ac --file /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/environment.yaml && \ +mamba env create --prefix /conda-envs/2e8e4401e9abbca04941f823d00fe74a --file /conda-envs/2e8e4401e9abbca04941f823d00fe74a/environment.yaml && \ +mamba env create --prefix /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221 --file /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221/environment.yaml && \ +mamba clean --all -y + +#Bicleaner needs the fasttext language id model installed +wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin +cp lid.176.bin /conda-envs/6dc32b6f0731acf817ada219622a98b8/lib/python3.9/site-packages/fastspell/lid.176.bin +cp lid.176.bin /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/lib/python3.8/site-packages/fastspell/lid.176.bin +cp lid.176.bin /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/lib/python3.9/site-packages/fastspell/lid.176.bin + +#Fastspell (used in bicleaner) uses hunspell to disambiguate between similar languages, install all hunspell dictionaries for that +wget -O fastspell_dictionaries.tgz https://github.com/mbanon/fastspell/releases/download/dictionaries_v1/fastspell_dictionaries.tgz +mkdir -p /usr/share/hunspell +tar -xf fastspell_dictionaries.tgz --directory /usr/share/hunspell + +%runscript +exec /bin/bash "$@" +%startscript +exec /bin/bash "$@" diff --git a/InstallSnakemakeEnvs b/InstallSnakemakeEnvs new file mode 100644 index 000000000..0df216be9 --- /dev/null +++ b/InstallSnakemakeEnvs @@ -0,0 +1,14 @@ +from os import listdir +def get_envs(wildcards): + return [x.replace(".yml",".done") for x in os.listdir("envs") if x.endswith(".yml")] + +container: 'Ftt.sif' + +rule all: + input: get_envs + +rule make_envs: + conda: 'envs/{env}.yml' + output: '{env}.done' + shell: f'touch {{output}}' + diff --git a/Makefile b/Makefile index 98e69b3cd..029ef2ad3 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ PROFILE?=local # execution rule or path to rule output, default is all TARGET= REPORTS?=../reports +EXTRA= # for tensorboard MODELS?=../models @@ -30,9 +31,24 @@ conda: snakemake: $(CONDA_ACTIVATE) base - mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.12.2 tabulate==0.8.10 --yes + mamba create -c conda-forge -c bioconda -n snakemake snakemake==7.19.1 tabulate==0.8.10 --yes mkdir -p "$(SNAKEMAKE_OUTPUT_CACHE)" + +containerize: + $(CONDA_ACTIVATE) snakemake + $(SNAKEMAKE) \ + --profile=profiles/$(PROFILE) \ + --configfile $(CONFIG) \ + --containerize > Dockerfile + spython recipe Dockerfile Ftt.def + sed -i "s|%files|%files\npipeline/setup/install-deps.sh install-deps.sh|" Ftt.def + sed -i 's#%post#%post\ncat /etc/apt/sources.list | sed "s/archive.ubuntu.com/mirrors.nic.funet.fi/g" > temp \&\& mv temp /etc/apt/sources.list \ + \napt-get update \&\& apt-get -y install gcc g++ \ + \nexport DEBIAN_FRONTEND=noninteractive \ + \nbash install-deps.sh#' Ftt.def + apptainer build Ftt.sif Ftt.def + # build container image for cluster and run-local modes (preferred) build: sudo singularity build Singularity.sif Singularity.def @@ -53,7 +69,18 @@ dry-run: --profile=profiles/$(PROFILE) \ --configfile $(CONFIG) \ -n \ - $(TARGET) + $(TARGET) \ + $(EXTRA) \ + +dry-run-hpc: + echo "Dry run with config $(CONFIG) and profile $(PROFILE)" + $(SNAKEMAKE) \ + --profile=profiles/$(PROFILE) \ + --configfile $(CONFIG) \ + -n \ + --conda-base-path=../bin \ + $(TARGET) \ + $(EXTRA) test-dry-run: CONFIG=configs/config.test.yml test-dry-run: dry-run @@ -67,7 +94,18 @@ run: $(SNAKEMAKE) \ --profile=profiles/$(PROFILE) \ --configfile $(CONFIG) \ - $(TARGET) + $(TARGET) \ + $(EXTRA) + +run-hpc: + echo "Running with config $(CONFIG) and profile $(PROFILE)" + chmod +x profiles/$(PROFILE)/* + $(SNAKEMAKE) \ + --profile=profiles/$(PROFILE) \ + --configfile $(CONFIG) \ + --conda-base-path=../bin \ + $(TARGET) \ + $(EXTRA) test: CONFIG=configs/config.test.yml test: run diff --git a/Snakefile b/Snakefile index 9eed44086..03cae35f1 100644 --- a/Snakefile +++ b/Snakefile @@ -1,5 +1,6 @@ import yaml import os +import glob from snakemake.utils import min_version from pipeline.bicleaner import packs @@ -13,48 +14,72 @@ min_version("6.6.1") ### configuration -container: 'Singularity.sif' +containerized: config.get('image', 'Singularity.sif') install_deps = config['deps'] == 'true' -data_root_dir = config['root'] -cuda_dir = config['cuda'] -cudnn_dir = config['cudnn'] +data_root_dir = config.get('root', srcdir("../data")) +cuda_dir = config.get('cuda', os.environ.get("CUDA_INSTALL_ROOT","")) +cudnn_dir = config.get('cudnn', os.environ.get("CUDNN_INSTALL_ROOT","")) +rocm_dir = config.get('rocm',os.environ.get("ROCM_PATH","")) + gpus_num = config['numgpus'] # marian occupies all GPUs on a machine if `gpus` are not specified gpus = config['gpus'] if config['gpus'] else ' '.join([str(n) for n in range(int(gpus_num))]) workspace = config['workspace'] marian_cmake = config['mariancmake'] +marian_version = config.get('marianversion','marian-dev') # experiment src = config['experiment']['src'] trg = config['experiment']['trg'] +src_three_letter = config['experiment'].get('src_three_letter') +trg_three_letter = config['experiment'].get('trg_three_letter') experiment = config['experiment']['name'] -mono_max_sent_src = config['experiment']['mono-max-sentences-src'] -mono_max_sent_trg = config['experiment']['mono-max-sentences-trg'] -bicl_default_threshold = config['experiment']['bicleaner']['default-threshold'] -bicl_dataset_thresholds = config['experiment']['bicleaner']['dataset-thresholds'] -backward_pretrained = config['experiment']['backward-model'] -vocab_pretrained = config['experiment']['vocab'] +mono_max_sent_src = config['experiment'].get('mono-max-sentences-src') +mono_max_sent_trg = config['experiment'].get('mono-max-sentences-trg') +parallel_max_sents = config['experiment'].get('parallel-max-sentences',"inf") + + + +backward_pretrained = config['experiment'].get('backward-model') +backward_pretrained_vocab = config['experiment'].get('backward-vocab') +vocab_pretrained = config['experiment'].get('vocab') +forward_pretrained = config['experiment'].get('forward-model') experiment_dir=f"{data_root_dir}/experiments/{src}-{trg}/{experiment}" # override marian cofings marian_args = {name: ' '.join([f'--{k} {v}' for k,v in conf.items() ]) - for name, conf in config['marian-args'].items()} + for name, conf in config.get('marian-args',{}).items()} + +# There can be multiple opus teachers, but a single teacher can also be provided +# as string, so convert it to list here +opusmt_teacher = config['experiment'].get('opusmt-teacher') +if opusmt_teacher and not isinstance(opusmt_teacher,list): + opusmt_teacher = [opusmt_teacher] + +opusmt_backward = config['experiment'].get('opusmt-backward') + +# if no target language token specified, use src (they might be different in rare cases) +target_language_token = config['experiment'].get('target-language-token',trg) + +#this is for reverse scoring with multilingual model +source_language_token = config['experiment'].get('target-language-token',src) # datasets train_datasets = config['datasets']['train'] valid_datasets = config['datasets']['devtest'] eval_datasets = config['datasets']['test'] -mono_src_datasets = config['datasets']['mono-src'] -mono_trg_datasets = config['datasets']['mono-trg'] +mono_src_datasets = config['datasets'].get('mono-src') +mono_trg_datasets = config['datasets'].get('mono-trg') mono_datasets = {src: mono_src_datasets, trg: mono_trg_datasets} mono_max_sent = {src: mono_max_sent_src, trg: mono_max_sent_trg} # parallelization -ensemble = list(range(config['experiment']['teacher-ensemble'])) +ensemble = list(range(config['experiment'].get('teacher-ensemble',0))) + split_length = config['experiment']['split-length'] # logging @@ -64,14 +89,19 @@ reports_dir = f"{data_root_dir}/reports/{src}-{trg}/{experiment}" # binaries cwd = os.getcwd() third_party_dir = f'{cwd}/3rd_party' -marian_dir = f'{third_party_dir}/marian-dev/build' + +if marian_version == 'lumi-marian': + marian_dir = f'{third_party_dir}/lumi-marian/build/' +else: + marian_dir = f'{third_party_dir}/marian-dev/build/' + bmt_marian_dir = f'{third_party_dir}/browsermt-marian-dev/build' -trainer = f'{marian_dir}/marian' -decoder = f'{marian_dir}/marian-decoder' -scorer = f'{marian_dir}/marian-scorer' -spm_encoder = f'{marian_dir}/spm_encode' -spm_trainer = f'{marian_dir}/spm_train' -spm_exporter = f'{marian_dir}/spm_export_vocab' +trainer = f'{marian_dir}marian' +decoder = f'{marian_dir}marian-decoder' +scorer = f'{marian_dir}marian-scorer' +spm_encoder = f'{marian_dir}spm_encode' +spm_trainer = f'{marian_dir}spm_train' +spm_exporter = f'{marian_dir}spm_export_vocab' bmt_decoder = f'{bmt_marian_dir}/marian-decoder' bmt_converter = f'{bmt_marian_dir}/marian-conv' @@ -105,9 +135,23 @@ exported_dir = f"{models_dir}/exported" best_model_metric = config['experiment']['best-model'] best_model = f"final.model.npz.best-{best_model_metric}.npz" backward_dir = f'{models_dir}/backward' -spm_sample_size=config['experiment']['spm-sample-size'] +spm_sample_size=config['experiment'].get('spm-sample-size') spm_vocab_size=config['experiment'].get('spm-vocab-size',"32000") -vocab_path=vocab_pretrained or f"{models_dir}/vocab/vocab.spm" + +#forward pretrained models are trained with sentencepiece integration, the value is a path to the directory +if forward_pretrained: + teacher_base_dir = forward_pretrained + #this means that the when the model dirs are expanded, the result is only the teacher_base_dir + ensemble = [""] + + +#default vocab path used with base ftt +vocab_path = vocab_pretrained or f"{models_dir}/vocab/vocab.spm" + +if opusmt_backward: + backward_vocab = f"{backward_dir}/vocab.yml" +else: + backward_vocab = vocab_path #evaluation eval_data_dir = f"{original}/eval" @@ -120,7 +164,7 @@ eval_teacher_ens_dir = f'{eval_res_dir}/teacher-ensemble' # set common environment variables envs = f'''SRC={src} TRG={trg} MARIAN="{marian_dir}" BMT_MARIAN="{bmt_marian_dir}" GPUS="{gpus}" WORKSPACE={workspace} \ -BIN="{bin}" CUDA_DIR="{cuda_dir}" CUDNN_DIR="{cudnn_dir}" ''' +BIN="{bin}" CUDA_DIR="{cuda_dir}" CUDNN_DIR="{cudnn_dir}" ROCM_PATH="{rocm_dir}" COMPRESSION_CMD=pigz ARTIFACT_EXT=gz''' # CUDA_VISIBLE_DEVICES is used by bicleaner ai. slurm sets this variable # it can be overriden manually by 'gpus' config setting to split GPUs in local mode if config['gpus']: @@ -128,15 +172,19 @@ if config['gpus']: ### workflow options -results = [f'{exported_dir}/model.{src}{trg}.intgemm.alphas.bin.gz', - f'{exported_dir}/lex.50.50.{src}{trg}.s2t.bin.gz', - f'{exported_dir}/vocab.{src}{trg}.spm.gz', - f'{experiment_dir}/config.yml', - *expand(f'{eval_res_dir}/teacher-base{{ens}}/{{dataset}}.metrics',ens=ensemble, dataset=eval_datasets), - *expand(f'{eval_student_dir}/{{dataset}}.metrics', dataset=eval_datasets), - *expand(f'{eval_student_finetuned_dir}/{{dataset}}.metrics', dataset=eval_datasets), - *expand(f'{eval_speed_dir}/{{dataset}}.metrics', dataset=eval_datasets) - ] +results = [ + f'{exported_dir}/model.{src}{trg}.intgemm.alphas.bin.gz', + f'{exported_dir}/lex.50.50.{src}{trg}.s2t.bin.gz', + f'{exported_dir}/vocab.{src}{trg}.spm.gz', + f'{experiment_dir}/config.yml', + *expand(f'{eval_student_dir}/{{dataset}}.metrics',dataset=eval_datasets), + *expand(f'{eval_student_finetuned_dir}/{{dataset}}.metrics',dataset=eval_datasets), + *expand(f'{eval_speed_dir}/{{dataset}}.metrics',dataset=eval_datasets) + ] + +#don't evaluate opus mt teachers or pretrained teachers (TODO: fix sp issues with opusmt teacher evaluation) +if not (opusmt_teacher or forward_pretrained): + results.extend(expand(f'{eval_res_dir}/teacher-base0-{{ens}}/{{dataset}}.metrics',ens=ensemble, dataset=eval_datasets)) if len(ensemble) > 1: results.extend(expand(f'{eval_teacher_ens_dir}/{{dataset}}.metrics', dataset=eval_datasets)) @@ -144,18 +192,35 @@ if len(ensemble) > 1: if install_deps: results.append("/tmp/flags/setup.done") -if not backward_pretrained: +#three options for backward model: pretrained path, url to opus-mt, or train backward +if backward_pretrained: + do_train_backward = False + backward_dir = backward_pretrained +elif opusmt_backward: + do_train_backward = False +else: # don't evaluate pretrained model results.extend(expand(f'{eval_backward_dir}/{{dataset}}.metrics',dataset=eval_datasets)) do_train_backward=True -else: - do_train_backward = False - backward_dir = backward_pretrained # bicleaner -bicleaner_type = packs.find(src, trg) -bicleaner_env = "envs/bicleaner-ai.yml" if bicleaner_type == 'bicleaner-ai' else 'envs/bicleaner.yml' + +if 'bicleaner' in config['experiment']: + bicl_default_threshold = config['experiment']['bicleaner']['default-threshold'] + bicl_dataset_thresholds = config['experiment']['bicleaner']['dataset-thresholds'] + + bicleaner_type = packs.find(src, trg) +else: + bicleaner_type = None + +if bicleaner_type == 'bicleaner-ai': + if marian_version == 'lumi-marian': + bicleaner_env = 'envs/bicleaner-ai-lumi.yml' + else: + bicleaner_env = 'envs/bicleaner-ai.yml' +else: + bicleaner_env = 'envs/bicleaner.yml' if bicleaner_type: clean_corpus_prefix = f'{biclean}/corpus' @@ -172,11 +237,11 @@ clean_corpus_trg = f'{clean_corpus_prefix}.{trg}.gz' # augmentation -if mono_trg_datasets: +if mono_trg_datasets and not (opusmt_teacher or forward_pretrained): teacher_corpus = f'{augmented}/corpus' augment_corpus = True final_teacher_dir = teacher_finetuned_dir - results.extend(expand(f'{eval_res_dir}/teacher-finetuned{{ens}}/{{dataset}}.metrics',ens=ensemble, dataset=eval_datasets)) + results.extend(expand(f'{eval_res_dir}/teacher-finetuned0-{{ens}}/{{dataset}}.metrics',ens=ensemble, dataset=eval_datasets)) else: augment_corpus = False final_teacher_dir = teacher_base_dir @@ -194,7 +259,6 @@ def dataset_norm(name: str): def get_args(section): return marian_args.get(section) or "" - ### rules shell.prefix(f"{envs} ") @@ -243,8 +307,8 @@ rule marian: spm_trainer=protected(f'{third_party_dir}/{{marian_type}}/build/spm_train'), spm_encoder=protected(f'{third_party_dir}/{{marian_type}}/build/spm_encode'), spm_exporter=protected(f'{third_party_dir}/{{marian_type}}/build/spm_export_vocab') - params: build_dir=f'{third_party_dir}/{{marian_type}}/build' - shell: 'bash pipeline/setup/compile-marian.sh {params.build_dir} {threads} {marian_cmake} >> {log} 2>&1' + params: build_dir=f'{third_party_dir}/{{marian_type}}/build',marian_type=f'{{marian_type}}' + shell: 'bash pipeline/setup/compile-{params.marian_type}.sh {params.build_dir} {threads} {marian_cmake} >> {log} 2>&1' rule fast_align: message: "Compiling fast align" @@ -274,6 +338,19 @@ rule extract_lex: shell: 'bash pipeline/setup/compile-extract-lex.sh {extract_lex_build} {threads} >> {log} 2>&1' # data downloading +# TODO: Tatoeba data has dev, test and train in same big tar, make a rule producing them all, +# and use snakemake ruleorder to prioritize it over this +ruleorder: download_tatoeba_corpus > download_corpus + +rule download_tatoeba_corpus: + message: "Downloading Tatoeba corpus" + log: f"{log_dir}/download_corpus/corpus_devset_test/tc_{{version}}.log" + conda: "envs/base.yml" + threads: 1 +# group: 'data' + output: multiext(f"{original}/corpus/tc_{{version}}", f".{src}.gz", f".{trg}.gz"),multiext(f"{original}/devset/tc_{{version}}", f".{src}.gz", f".{trg}.gz"),multiext(f"{original}/eval/tc_{{version}}", f".{src}.gz", f".{trg}.gz") + params: prefix=f"{original}", version="{version}",max_sents=parallel_max_sents + shell: 'bash pipeline/data/download-tc-data.sh {src_three_letter} {trg_three_letter} {src} {trg} {params.prefix} {params.version} {params.max_sents} >> {log} 2>&1' rule download_corpus: message: "Downloading parallel corpus" @@ -356,7 +433,7 @@ if use_bicleaner: conda: bicleaner_env # group: "bicleaner" threads: gpus_num * 2 if bicleaner_type == "bicleaner-ai" else workflow.cores - resources: gpu=gpus_num if bicleaner_type == "bicleaner-ai" else 0 + resources: gpu=gpus_num if bicleaner_type == "bicleaner-ai" else 0, mem_mb=128000 input: ancient(rules.kenlm.output), multiext(f"{clean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz"), pack_dir=rules.bicleaner_pack.output output: multiext(f"{biclean}/corpus/{{dataset}}", f".{src}.gz", f".{trg}.gz") @@ -369,6 +446,7 @@ if use_bicleaner: "{params.prefix_input}" "{params.prefix_output}" {params.threshold} {bicleaner_type} {threads} \ "{input.pack_dir}" >> {log} 2>&1''' + rule merge_corpus: message: "Merging clean parallel datasets" log: f"{log_dir}/merge_corpus.log" @@ -378,8 +456,10 @@ rule merge_corpus: input: expand(f"{clean_corpus_prefix}/{{dataset}}.{{lang}}.gz", dataset=train_datasets, lang=[src, trg]), bin=ancient(deduper) output: src=clean_corpus_src,trg=clean_corpus_trg - params: prefix_output=clean_corpus_prefix, prefixes=expand(f"{clean_corpus_prefix}/{{dataset}}", dataset=train_datasets) - shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" {params.prefixes} >> {log} 2>&1''' + params: prefix_output=clean_corpus_prefix, + prefixes=expand(f"{clean_corpus_prefix}/{{dataset}}", dataset=train_datasets), + max_sents=parallel_max_sents + shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" {params.max_sents} {params.prefixes} >> {log} 2>&1''' rule merge_devset: message: "Merging devsets" @@ -391,7 +471,7 @@ rule merge_devset: bin=ancient(deduper) output: multiext(f"{original}/devset", f".{src}.gz", f".{trg}.gz") params: prefix_output=f"{original}/devset", prefixes=expand(f"{original}/devset/{{dataset}}", dataset=valid_datasets) - shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" {params.prefixes} >> {log} 2>&1''' + shell: '''bash pipeline/clean/merge-corpus.sh "{params.prefix_output}" inf {params.prefixes} >> {log} 2>&1''' rule merge_mono: message: "Merging clean monolingual datasets" @@ -419,9 +499,12 @@ if not vocab_pretrained: output: vocab_path params: prefix_train=clean_corpus_prefix,prefix_test=f"{original}/devset" shell: '''bash pipeline/train/spm-vocab.sh "{input.corpus_src}" "{input.corpus_trg}" "{output}" {spm_sample_size} \ - {threads} {spm_vocab_size} >> {log} 2>&1''' + {threads} {spm_vocab_size} >> {log} 2>&1''' if do_train_backward: + mono_trg_file = f'{translated}/mono_trg/file.{{part}}' + deseg_mono_trg_outfile = f'{mono_trg_file}.out' + rule train_backward: message: "Training backward model" log: f"{log_dir}/train_backward.log" @@ -439,6 +522,19 @@ if do_train_backward: backward train {trg} {src} "{params.prefix_train}" "{params.prefix_test}" "{backward_dir}" \ "{input.vocab}" "{best_model_metric}" {params.args} >> {log} 2>&1''' +elif opusmt_backward: + mono_trg_file = f'{translated}/mono_trg/file.{{part}}.{{model_index}}.opusmt' + deseg_mono_trg_outfile = f'{mono_trg_file}.out.deseg' + + rule download_opusmt_backward: + message: "Downloading OPUS-MT backward model" + log: f"{log_dir}/download_backward.log" + conda: "envs/base.yml" + output: model=f'{backward_dir}/{best_model}',vocab=f'{backward_dir}/vocab.yml', model_dir=directory({backward_dir}) + shell: '''bash pipeline/opusmt/download-model.sh \ + "{opusmt_backward}" "{backward_dir}" "{best_model}" {trg_three_letter} {src_three_letter} >> {log} 2>&1''' + + if augment_corpus: checkpoint split_mono_trg: message: "Splitting monolingual trg dataset" @@ -449,6 +545,8 @@ if augment_corpus: output: directory(f'{translated}/mono_trg') shell: 'bash pipeline/translate/split-mono.sh {input.corpora} {output} {split_length} >> {log} 2>&1' + #TODO: make it possible to use multiple backward models, add filtering for backtranslations + #TODO: add preprocessing and deseg for OPUS-MT backward model backtranslation, currently works only with trained backward model rule translate_mono_trg: message: "Translating monolingual trg dataset with backward model" log: f"{log_dir}/translate_mono_trg/{{part}}.log" @@ -456,11 +554,11 @@ if augment_corpus: threads: gpus_num * 2 resources: gpu=gpus_num input: - bin=ancient(decoder), file=f'{translated}/mono_trg/file.{{part}}', + bin=ancient(decoder), file=mono_trg_file, vocab=vocab_path, model=f'{backward_dir}/{best_model}' - output: f'{translated}/mono_trg/file.{{part}}.out' + output: file=f'{mono_trg_file}.out' params: args = get_args("decoding-backward") - shell: '''bash pipeline/translate/translate.sh "{input.file}" "{input.vocab}" {input.model} {params.args} \ + shell: '''bash pipeline/translate/translate.sh "{input.file}" "{output.file}" "{input.vocab}" {input.model} {params.args} \ >> {log} 2>&1''' rule collect_mono_trg: @@ -470,11 +568,11 @@ if augment_corpus: threads: 4 #group 'mono_trg' input: - lambda wildcards: expand(f"{translated}/mono_trg/file.{{part}}.out", + lambda wildcards: expand(deseg_mono_trg_outfile, part=find_parts(wildcards, checkpoints.split_mono_trg)) output: f'{translated}/mono.{src}.gz' params: src_mono=f"{clean}/mono.{trg}.gz",dir=directory(f'{translated}/mono_trg') - shell: 'bash pipeline/translate/collect.sh "{params.dir}" "{output}" "{params.src_mono}" >> {log} 2>&1' + shell: 'bash pipeline/translate/collect.sh "{params.dir}" "{output}" "{params.src_mono}" "" >> {log} 2>&1' rule merge_augmented: message: "Merging augmented dataset" @@ -483,44 +581,66 @@ if augment_corpus: threads: 4 #group 'mono_trg' input: - src1=clean_corpus_src,src2=rules.collect_mono_trg.output, - trg1=clean_corpus_trg,trg2=rules.split_mono_trg.input, + src1=clean_corpus_src, + src2=rules.collect_mono_trg.output, + trg1=clean_corpus_trg, + trg2=rules.split_mono_trg.input.corpora, bin=ancient(deduper) output: res_src=f'{augmented}/corpus.{src}.gz',res_trg=f'{augmented}/corpus.{trg}.gz' shell: '''bash pipeline/translate/merge-corpus.sh \ - "{input.src1}" "{input.src2}" "{input.trg1}" "{input.trg2}" "{output.res_src}" "{output.res_trg}" \ + "{input.src1}" "{input.src2}" "{input.trg1}" "{input.trg2}" "{output.res_src}" "{output.res_trg}" "" \ >> {log} 2>&1''' -rule train_teacher: - message: "Training teacher on all data" - log: f"{log_dir}/train_teacher{{ens}}.log" - conda: "envs/base.yml" - threads: gpus_num*2 - resources: gpu=gpus_num - input: - rules.merge_devset.output, train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz', - bin=ancient(trainer), vocab=vocab_path - output: model=f'{teacher_base_dir}{{ens}}/{best_model}' - params: prefix_train=teacher_corpus, prefix_test=f"{original}/devset", dir=directory(f'{teacher_base_dir}{{ens}}'), - args=get_args("training-teacher-base") - shell: '''bash pipeline/train/train.sh \ - teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \ - "{input.vocab}" "{best_model_metric}" {params.args} >> {log} 2>&1''' +# Three options for teacher: 1. download opus-mt model, 2. train teacher with pipeline, 3. path to pretrained teacher model +# TODO: make it possible to combine any of the above options, i.e. use opus-mt, train and use +# pretrained all in the same run. Probably should have a model list where you can define all the +# models to use, and then prefixes (opusmt_, train_, pretrained_, nllb_ etc.) determine how the models are +# created/used/connected to (in case of e.g. external APIs). +if 'opusmt-teacher' in config['experiment']: + rule download_teacher_model: + message: "Downloading OPUS-MT teacher model" + log: f"{log_dir}/download_teacher{{model_index}}-{{ens}}.log" + conda: "envs/base.yml" + threads: 1 + output: model=f'{teacher_base_dir}{{model_index}}-{{ens}}/{best_model}',vocab=f'{teacher_base_dir}{{model_index}}-{{ens}}/vocab.yml', model_dir=directory(f'{teacher_base_dir}{{model_index}}-{{ens}}') + params: teacher_dir=f'{teacher_base_dir}{{model_index}}-{{ens}}', + teacher_url=lambda wildcards: opusmt_teacher[int(wildcards.model_index)] + shell: '''bash pipeline/opusmt/download-model.sh \ + "{params.teacher_url}" "{params.teacher_dir}" "{best_model}" {src_three_letter} {trg_three_letter} >> {log} 2>&1''' +elif not forward_pretrained: + rule train_teacher: + message: "Training teacher on all data" + log: f"{log_dir}/train_teacher{{model_index}}-{{ens}}.log" + conda: "envs/base.yml" + threads: gpus_num*2 + resources: gpu=gpus_num + input: + rules.merge_devset.output, train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz', + bin=ancient(trainer), vocab=vocab_path + output: model=f'{teacher_base_dir}{{model_index}}-{{ens}}/{best_model}' + params: prefix_train=teacher_corpus, + prefix_test=f"{original}/devset", + dir=directory(f'{teacher_base_dir}{{model_index}}-{{ens}}'), + args=get_args("training-teacher-base") + shell: '''bash pipeline/train/train.sh \ + teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \ + "{input.vocab}" "{best_model_metric}" {params.args} >> {log} 2>&1''' + if augment_corpus: rule finetune_teacher: message: "Finetune teacher on parallel corpus" - log: f"{log_dir}/finetune_teacher{{ens}}.log" + log: f"{log_dir}/finetune_teacher0-{{ens}}.log" conda: "envs/base.yml" threads: gpus_num * 2 resources: gpu=gpus_num input: - rules.merge_devset.output, model=f'{teacher_base_dir}{{ens}}/{best_model}', + rules.merge_devset.output, model=f'{teacher_base_dir}0-{{ens}}/{best_model}', train_src=clean_corpus_src, train_trg=clean_corpus_trg, bin=ancient(trainer), vocab=vocab_path - output: model=f'{teacher_finetuned_dir}{{ens}}/{best_model}' + output: model=f'{teacher_finetuned_dir}0-{{ens}}/{best_model}' params: prefix_train=clean_corpus_prefix, prefix_test=f"{original}/devset", - dir=directory(f'{teacher_finetuned_dir}{{ens}}'), + dir=directory(f'{teacher_finetuned_dir}0-{{ens}}'), args=get_args("training-teacher-finetuned") shell: '''bash pipeline/train/train.sh \ teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \ @@ -528,8 +648,6 @@ if augment_corpus: ### translation with teacher -# corpus - checkpoint split_corpus: message: "Splitting the corpus to translate" log: f"{log_dir}/split_corpus.log" @@ -540,44 +658,102 @@ checkpoint split_corpus: shell: '''bash pipeline/translate/split-corpus.sh \ {input.corpus_src} {input.corpus_trg} {output} {split_length} >> {log} 2>&1''' +if opusmt_teacher: + teacher_source_file = f'{translated}/corpus/file.{{part}}.{{model_index}}.opusmt' + teacher_target_file = f'{translated}/corpus/file.{{part}}.{{model_index}}.opusmt.nbest' + teacher_mono_source_file = f'{translated}/mono_src/file.{{part}}.{{model_index}}.opusmt' + teacher_mono_target_file = f'{translated}/mono_src/file.{{part}}.{{model_index}}.opusmt.out' + translated_mono_src_extension = "opusmt.out" + deseg_nbest_file = f'{teacher_target_file}.deseg' + + rule opusmt_deseg_translation: + message: "Desegmenting OPUS-MT model translation" + log: f"{log_dir}/opusmt_deseg_mono_translation/{{part}}.{{model_index}}.log" + threads: 1 + wildcard_constraints: + model_index="\d+" + input: f'{translated}/mono_src/file.{{part}}.{{model_index}}.opusmt.out' + output: f'{translated}/mono_src/file.{{part}}.{{model_index}}.out' + run: + with open(input[0], "rt", encoding="utf8") as infile,open(output[0], "wt", encoding="utf8") as outfile: + for line in infile: + deseg_line = line.replace(" ","").replace("▁"," ") + outfile.write(deseg_line) + + #This is an optional rule that only applies when OPUS-MT model is used as teacher. + #Required due to OPUS-MT models not using the integrated SentencePiece in Marian + rule opusmt_preprocess_corpus: + message: "Preprocessing source file for OPUS-MT model" + log: f"{log_dir}/opusmt_preprocess_corpus/{{corpus}}.{{part}}.{{model_index}}.log" + conda: "envs/base.yml" + threads: 1 + input: + file=f'{translated}/{{corpus}}/file.{{part}}', + teacher_model=f"{final_teacher_dir}{{model_index}}-0/{best_model}", + spm_encoder=ancient(spm_encoder) + output: f'{translated}/{{corpus}}/file.{{part}}.{{model_index}}.opusmt' + shell: '''bash pipeline/translate/opusmt-preprocess.sh \ + {input.file} {input.teacher_model} src "source.spm" {input.spm_encoder} {target_language_token} {wildcards.model_index} >> {log} 2>&1''' + rule opusmt_deseg_nbest: + message: "Desegmenting OPUS-MT model nbest list" + log: f"{log_dir}/opusmt_deseg_nbest/{{part}}.{{model_index}}.log" + threads: 1 + input: nbest=f"{teacher_source_file}.nbest" + output: temp(deseg_nbest_file) + run: + with open(input[0], "rt", encoding="utf8") as infile,open(output[0], "wt", encoding="utf8") as outfile: + for line in infile: + line_split = line.split(" ||| ") + line_split[1] = line_split[1].replace(" ","").replace("▁"," ") + outfile.write(" ||| ".join(line_split)) +else: + teacher_source_file = f'{translated}/corpus/file.{{part}}' + teacher_target_file = f'{translated}/corpus/file.{{part}}.{{model_index}}.nbest' + teacher_mono_source_file = f'{translated}/mono_src/file.{{part}}' + teacher_mono_target_file = f'{translated}/mono_src/file.{{part}}.{{model_index}}.out' + translated_mono_src_extension = ".out" + deseg_nbest_file = teacher_target_file + + + rule translate_corpus: message: "Translating corpus with teacher" - log: f"{log_dir}/translate_corpus/{{part}}.log" + log: f"{log_dir}/translate_corpus/{{part}}.{{model_index}}.log" conda: "envs/base.yml" threads: gpus_num*2 resources: gpu=gpus_num input: ancient(decoder), - file=f'{translated}/corpus/file.{{part}}', + file=teacher_source_file, vocab=vocab_path, - teacher_models=expand(f"{final_teacher_dir}{{ens}}/{best_model}",ens=ensemble) - output: f'{translated}/corpus/file.{{part}}.nbest' + teacher_models=expand(f"{final_teacher_dir}{{{{model_index}}}}-{{ens}}/{best_model}",ens=ensemble) + output: file=teacher_target_file params: args=get_args('decoding-teacher') shell: '''bash pipeline/translate/translate-nbest.sh \ - "{input.file}" "{input.vocab}" {input.teacher_models} {params.args} >> {log} 2>&1''' + "{input.file}" "{output.file}" "{input.vocab}" {input.teacher_models} {params.args} >> {log} 2>&1''' rule extract_best: message: "Extracting best translations for the corpus" - log: f"{log_dir}/extract_best/{{part}}.log" + log: f"{log_dir}/extract_best/{{part}}.{{model_index}}.log" conda: "envs/base.yml" threads: 1 #group 'translate_corpus' - input: nbest=f"{translated}/corpus/file.{{part}}.nbest", ref=f"{translated}/corpus/file.{{part}}.ref" - output: f"{translated}/corpus/file.{{part}}.nbest.out" + input: nbest=deseg_nbest_file, ref=f"{translated}/corpus/file.{{part}}.ref" + output: f"{translated}/corpus/file.{{part}}.nbest.{{model_index}}.out" shell: 'python pipeline/translate/bestbleu.py -i {input.nbest} -r {input.ref} -m bleu -o {output} >> {log} 2>&1' +model_indices = list(range(len(opusmt_teacher))) if opusmt_teacher else [0] + rule collect_corpus: message: "Collecting translated corpus" - log: f"{log_dir}/collect_corpus.log" + log: f"{log_dir}/collect_corpus_{{model_index}}.log" conda: "envs/base.yml" threads: 4 #group 'translate_corpus' - input: - lambda wildcards: expand(f"{translated}/corpus/file.{{part}}.nbest.out", - part=find_parts(wildcards, checkpoints.split_corpus)) - output: f'{translated}/corpus.{trg}.gz' + input: lambda wildcards: expand(f"{translated}/corpus/file.{{part}}.nbest.{wildcards.model_index}.out", part=find_parts(wildcards, checkpoints.split_corpus)) + output: trg_corpus=f'{translated}/corpus.{{model_index}}.{trg}.gz' params: src_corpus=clean_corpus_src - shell: 'bash pipeline/translate/collect.sh {translated}/corpus {output} {params.src_corpus} >> {log} 2>&1' + shell: 'bash pipeline/translate/collect.sh {translated}/corpus {output} {params.src_corpus} {wildcards.model_index} >> {log} 2>&1' # mono @@ -592,31 +768,58 @@ checkpoint split_mono_src: rule translate_mono_src: message: "Translating monolingual src dataset with teacher" - log: f"{log_dir}/translate_mono_src/{{part}}.log" + log: f"{log_dir}/translate_mono_src/{{part}}.{{model_index}}.log" conda: "envs/base.yml" threads: gpus_num*2 + wildcard_constraints: + model_index="\d+" resources: gpu=gpus_num input: - file=f'{translated}/mono_src/file.{{part}}',vocab=vocab_path, - teacher_models=expand(f"{final_teacher_dir}{{ens}}/{best_model}",ens=ensemble), + file=teacher_mono_source_file,vocab=vocab_path, + teacher_models=expand(f"{final_teacher_dir}{{{{model_index}}}}-{{ens}}/{best_model}",ens=ensemble), bin=ancient(decoder) - output: f'{translated}/mono_src/file.{{part}}.out' + output: file=teacher_mono_target_file params: args=get_args('decoding-teacher') - shell: '''bash pipeline/translate/translate.sh "{input.file}" "{input.vocab}" {input.teacher_models} \ + shell: '''bash pipeline/translate/translate.sh "{input.file}" "{output.file}" "{input.vocab}" {input.teacher_models} \ {params.args} >> {log} 2>&1''' -rule collect_mono_src: - message: "Collecting translated mono src dataset" - log: f"{log_dir}/collect_mono_src.log" - conda: "envs/base.yml" - threads: 4 - #group 'mono_src' - input: - lambda wildcards: expand(f"{translated}/mono_src/file.{{part}}.out", - part=find_parts(wildcards, checkpoints.split_mono_src)) - output: f'{translated}/mono.{trg}.gz' - params: src_mono=f"{clean}/mono.{src}.gz",dir=f'{translated}/mono_src' - shell: 'bash pipeline/translate/collect.sh "{params.dir}" "{output}" "{params.src_mono}" >> {log} 2>&1' +#If there are no mono src datasets, create dummy output files, since the merge step +#expects translated mono src files (TODO: separate deduping and shuffling from merge script +#to remove the need for this workaround) +if mono_src_datasets is None: + rule collect_mono_src_dummy: + message: "Collecting translated mono src dataset (dummy rule, used in case where no mono src datasets)" + log: f"{log_dir}/collect_mono_src.{{model_index}}.log" + conda: "envs/base.yml" + threads: 1 + #group 'mono_src' + params: src_mono=f"{clean}/mono.{src}.gz",dir=f'{translated}/mono_src' + output: trg_mono=f'{translated}/mono.{{model_index}}.{trg}.gz' + shell: 'touch {output.trg_mono} >> {log} 2>&1' + rule mono_src_dummy: + message: "Creating mono src dataset (dummy rule, used in case where no mono src datasets)" + log: f"{log_dir}/create_mono_src.log" + conda: "envs/base.yml" + threads: 1 + #group 'mono_src' + params: src_mono=f"{clean}/mono.{src}.gz",dir=f'{translated}/mono_src' + output: src_mono=f"{clean}/mono.{src}.gz" + shell: 'touch {output.src_mono} >> {log} 2>&1' +else: + rule collect_mono_src: + message: "Collecting translated mono src dataset" + log: f"{log_dir}/collect_mono_src.{{model_index}}.log" + conda: "envs/base.yml" + threads: 4 + wildcard_constraints: + model_index="\d+" + #group 'mono_src' + input: + lambda wildcards: expand(f'{translated}/mono_src/file.{{part}}.{wildcards.model_index}.out', + part=find_parts(wildcards, checkpoints.split_mono_src)) + output: f'{translated}/mono.{{model_index}}.{trg}.gz' + params: src_mono=f"{clean}/mono.{src}.gz",dir=f'{translated}/mono_src' + shell: 'bash pipeline/translate/collect-mono.sh "{params.dir}" "{output}" "{params.src_mono}" {wildcards.model_index} >> {log} 2>&1' # merge @@ -625,18 +828,53 @@ rule merge_translated: log: f"{log_dir}/merge_translated.log" conda: "envs/base.yml" threads: 4 + resources: mem_mb=64000 #group 'mono_src' input: - src1=clean_corpus_src,src2=f"{clean}/mono.{src}.gz", - trg1=rules.collect_corpus.output,trg2=rules.collect_mono_src.output, + src1=clean_corpus_src, + src2=f"{clean}/mono.{src}.gz", + trg1=lambda wildcards: expand(f"{translated}/corpus.{{model_index}}.{trg}.gz",model_index=model_indices), + trg2=lambda wildcards: expand(f"{translated}/mono.{{model_index}}.{trg}.gz",model_index=model_indices), bin=ancient(deduper) output: res_src=f'{merged}/corpus.{src}.gz',res_trg=f'{merged}/corpus.{trg}.gz' + params: + trg1_template=f"{translated}/corpus.model_index.{trg}.gz", + trg2_template=f"{translated}/mono.model_index.{trg}.gz" shell: '''bash pipeline/translate/merge-corpus.sh \ - "{input.src1}" "{input.src2}" "{input.trg1}" "{input.trg2}" "{output.res_src}" "{output.res_trg}" \ - >> {log} 2>&1''' + "{input.src1}" "{input.src2}" "{params.trg1_template}" "{params.trg2_template}" \ + "{output.res_src}" "{output.res_trg}" {model_indices} >> {log} 2>&1''' # train student +# preprocess source and target when scoring with opusmt model (note that deseg is not required, since +# scoring produces just scores) +if opusmt_backward: + score_source = f"{merged}/corpus.{src}.opusmt.gz" + score_target = f"{merged}/corpus.{trg}.opusmt.gz" +else: + score_source = rules.merge_translated.output.res_src + score_target = rules.merge_translated.output.res_trg + +#preprocess corpus before scoring, note that since the scoring is done with the +#backward model, source should be segmented with target.spm and vice versa +rule opusmt_preprocess_for_scoring: + message: "Preprocessing source file for OPUS-MT model" + log: f"{log_dir}/opusmt_preprocess_corpus/preprocess_for_scoring.log" + conda: "envs/base.yml" + threads: 1 + resources: mem_mb=64000 + input: + res_src=rules.merge_translated.output.res_src, + res_trg=rules.merge_translated.output.res_trg, + model=f'{backward_dir}/{best_model}', + spm_encoder=ancient(spm_encoder) + output: opusmt_source=f"{merged}/corpus.{src}.opusmt.gz", + opusmt_target=f"{merged}/corpus.{trg}.opusmt.gz" + shell: '''bash pipeline/translate/opusmt-preprocess.sh \ + {input.res_src} {input.model} src "target.spm" {input.spm_encoder} {target_language_token} && \ + bash pipeline/translate/opusmt-preprocess.sh \ + {input.res_trg} {input.model} trg "source.spm" {input.spm_encoder} {source_language_token} >> {log} 2>&1''' + rule score: message: "Scoring" log: f"{log_dir}/score.log" @@ -645,12 +883,12 @@ rule score: resources: gpu=gpus_num input: ancient(scorer), - model=f'{backward_dir}/{best_model}', vocab=vocab_path, - src_corpus=rules.merge_translated.output.res_src, trg_corpus=rules.merge_translated.output.res_trg + model=f'{backward_dir}/{best_model}', vocab=backward_vocab, + src_corpus=score_source, trg_corpus=score_target output: f"{filtered}/scores.txt" params: input_prefix=f'{merged}/corpus' shell: '''bash pipeline/cefilter/score.sh \ - "{input.model}" "{input.vocab}" "{params.input_prefix}" "{output}" >> {log} 2>&1''' + "{input.model}" "{input.vocab}" "{input.src_corpus}" "{input.trg_corpus}" "{output}" >> {log} 2>&1''' rule ce_filter: message: "Cross entropy filtering" @@ -724,7 +962,7 @@ rule finetune_student: rule quantize: message: "Quantization" - log: f"{log_dir}/quntize.log" + log: f"{log_dir}/quantize.log" conda: "envs/base.yml" threads: 1 input: @@ -769,7 +1007,7 @@ rule evaluate: data=multiext(f'{eval_data_dir}/{{dataset}}',f".{src}.gz",f".{trg}.gz"), models=lambda wildcards: f'{models_dir}/{wildcards.model}/{best_model}' if wildcards.model != 'teacher-ensemble' - else [f'{final_teacher_dir}{ens}/{best_model}' for ens in ensemble] + else [f'{final_teacher_dir}0-{ens}/{best_model}' for ens in ensemble] output: report(f'{eval_res_dir}/{{model}}/{{dataset}}.metrics', category='evaluation', subcategory='{model}', caption='reports/evaluation.rst') @@ -780,7 +1018,7 @@ rule evaluate: trg_lng=lambda wildcards: trg if wildcards.model != 'backward' else src, decoder_config=lambda wildcards: f'{models_dir}/{wildcards.model}/{best_model}.decoder.yml' if wildcards.model != 'teacher-ensemble' - else f'{final_teacher_dir}0/{best_model}.decoder.yml' + else f'{final_teacher_dir}0-0/{best_model}.decoder.yml' shell: '''bash pipeline/eval/eval-gpu.sh "{params.res_prefix}" "{params.dataset_prefix}" \ {params.src_lng} {params.trg_lng} "{params.decoder_config}" {input.models} >> {log} 2>&1''' diff --git a/configs/config.opusmt-multilingual-test.yml b/configs/config.opusmt-multilingual-test.yml new file mode 100644 index 000000000..df8832134 --- /dev/null +++ b/configs/config.opusmt-multilingual-test.yml @@ -0,0 +1,87 @@ +#### +# Example of a production config +# Change language pair, experiment name, datasets and other settings if needed +# Training low resource languages might require more tuning of pipeline/training/configs +### + +image: Ftt.sif + +experiment: + name: opusmt + src: en + trg: sw + src_three_letter: eng + trg_three_letter: swa + + #OPUS models are not ensembled, they have different vocabs anyway + teacher-ensemble: 1 + + #URL to the OPUS-MT model to use as the teacher + opusmt-teacher: "https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.zip" + #URL to the OPUS-MT model to use as the backward modeli + #NOTE: this should probably be a different model from the teacher, teacher is used here just for testing + opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.zip" + #Only specify this if model is target-multilingual + target-language-token: "deu" + + # path to a pretrained backward model (optional) + backward-model: "" + + # limits per downloaded dataset + mono-max-sentences-src: 100000000 + mono-max-sentences-trg: 20000000 + + # split corpus to parallelize translation + split-length: 2000000 + + best-model: perplexity + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + tc_Tatoeba-train-v2021-08-07: 0.5 + +marian-args: + training-backward: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-teacher-base: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-teacher-finetuned: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-student: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-student-finetuned: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + decoding-backward: + mini-batch-words: 2000 + decoding-teacher: + mini-batch-words: 1000 + precision: float16 + mini-batch: 512 + + +#TODO: extract this info straight from the OPUS model yml info file +datasets: + # parallel training corpus + train: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets to merge for validation while training + devtest: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets for evaluation + test: + - tc_Tatoeba-Challenge-v2021-08-07 diff --git a/configs/config.opusmt-multimodel-test.yml b/configs/config.opusmt-multimodel-test.yml new file mode 100644 index 000000000..fb3fd6e4f --- /dev/null +++ b/configs/config.opusmt-multimodel-test.yml @@ -0,0 +1,108 @@ +#### +# Example of a multiteacher config +# Change language pair, experiment name, datasets and other settings if needed +# Training low resource languages might require more tuning of pipeline/training/configs +### + +image: Ftt.sif + +experiment: + name: opusmt-multimodel-test + src: en + trg: sv + src_three_letter: eng + trg_three_letter: swe + + #OPUS models are not ensembled, they have different vocabs anyway + teacher-ensemble: 1 + + #URL to the OPUS-MT model to use as the teacher + opusmt-teacher: + - "https://object.pouta.csc.fi/Tatoeba-MT-models/gem-gem/opus-2020-10-04.zip" + - "https://object.pouta.csc.fi/Tatoeba-MT-models/eng-swe/opus+bt-2021-04-14.zip" + + #You only need to specify this if any model is target-multilingual + target-language-token: "swe" + + #URL to the OPUS-MT model to use as the backward model + #NOTE: this should probably be a different model from the teacher, teacher is used here just for testing + opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/gem-gem/opus-2020-10-04.zip" + + #You only need to specify this if backward model is target-multilingual + source-language-token: "eng" + + # path to a pretrained backward model (optional) + backward-model: "" + + # limits per downloaded dataset + mono-max-sentences-src: 10000 + mono-max-sentences-trg: 20000 + + # parallel data limit (mainly used for downsizing testing runs, to make GPU steps quicker). + # Comment out or specify "inf" to use all parallel data + parallel-max-sentences: 10000 + + # if using very small parallel corpus, also remember to set a low spm size, otherwise + # spm-encode fails due to not having enough subwords + spm-vocab-size: 1000 + + # split corpus to parallelize translation + split-length: 2000000 + + best-model: perplexity + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + tc_Tatoeba-train-v2021-08-07: 0.5 + +marian-args: + training-backward: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-teacher-base: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-teacher-finetuned: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-student: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-student-finetuned: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + decoding-backward: + mini-batch-words: 2000 + decoding-teacher: + mini-batch-words: 1000 + precision: float16 + mini-batch: 512 + +#marian-args: +# decoding-teacher: + # 2080ti or newer +# precision: float16 + +#TODO: extract this info straight from the OPUS model yml info file +datasets: + # parallel training corpus + train: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets to merge for validation while training + devtest: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets for evaluation + test: + - tc_Tatoeba-Challenge-v2021-08-07 + mono-src: + - news-crawl_news.2020 diff --git a/configs/config.opusmt-multimodel.yml b/configs/config.opusmt-multimodel.yml new file mode 100644 index 000000000..76c0a3bce --- /dev/null +++ b/configs/config.opusmt-multimodel.yml @@ -0,0 +1,88 @@ +#### +# Example of a multiteacher config +# Change language pair, experiment name, datasets and other settings if needed +# Training low resource languages might require more tuning of pipeline/training/configs +### + +image: Ftt.sif + +experiment: + name: opusmt-multimodel-prod + src: en + trg: sv + src_three_letter: eng + trg_three_letter: swe + + #OPUS models are not ensembled, they have different vocabs anyway + teacher-ensemble: 1 + + #URL to the OPUS-MT model to use as the teacher + opusmt-teacher: + - "https://object.pouta.csc.fi/Tatoeba-MT-models/gem-gem/opus-2020-10-04.zip" + - "https://object.pouta.csc.fi/Tatoeba-MT-models/eng-swe/opus+bt-2021-04-14.zip" + + #You only need to specify this if any model is target-multilingual + target-language-token: "swe" + + #URL to the OPUS-MT model to use as the backward model + opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/swe-eng/opus-2021-02-12.zip" + + #You only need to specify this if backward model is target-multilingual + source-language-token: "eng" + + # path to a pretrained backward model (optional) + backward-model: "" + + # limits per downloaded dataset + mono-max-sentences-src: 10000000 + mono-max-sentences-trg: 2000000 + + # parallel data limit (mainly used for downsizing testing runs, to make GPU steps quicker). + # Comment out or specify "inf" to use all parallel data + # parallel-max-sentences: 10000 + + # vocab training sample + spm-sample-size: 10000000 + + + # split corpus to parallelize translation + split-length: 2000000 + + best-model: perplexity + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + tc_Tatoeba-train-v2021-08-07: 0.3 + +marian-args: +# these configs override pipeline/train/configs + training-backward: + # change based on available training data + after: 10e + training-teacher-base: + # remove for low resource languages or if training without augmentation + after: 2e +# these configs override pipeline/translate/decoder.yml + decoding-backward: + # 12 Gb GPU, s2s model + mini-batch-words: 2000 + beam-size: 12 + decoding-teacher: + # 12 Gb GPU, ensemble of 2 teachers + mini-batch-words: 1000 + # 2080ti or newer + precision: float16 + mini-batch: 512 + +datasets: + # parallel training corpus + train: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets to merge for validation while training + devtest: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets for evaluation + test: + - tc_Tatoeba-Challenge-v2021-08-07 + mono-src: + - news-crawl_news.2020 diff --git a/configs/config.opusmt-test.yml b/configs/config.opusmt-test.yml new file mode 100644 index 000000000..c8e3f055a --- /dev/null +++ b/configs/config.opusmt-test.yml @@ -0,0 +1,85 @@ +#### +# Example of a production config +# Change language pair, experiment name, datasets and other settings if needed +# Training low resource languages might require more tuning of pipeline/training/configs +### + +image: Ftt.sif + +experiment: + name: opusmt + src: en + trg: fi + src_three_letter: eng + trg_three_letter: fin + + #OPUS models are not ensembled, they have different vocabs anyway + teacher-ensemble: 1 + + #URL to the OPUS-MT model to use as the teacher + opusmt-teacher: "best" + #URL to the OPUS-MT model to use as the backward model + opusmt-backward: "best" + + # path to a pretrained backward model (optional) + backward-model: "" + + # limits per downloaded dataset + mono-max-sentences-src: 100000000 + mono-max-sentences-trg: 20000000 + + parallel-max-sentences: 100000 + + # split corpus to parallelize translation + split-length: 2000000 + + best-model: perplexity + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + opus_GNOME/v1: 0.5 + +marian-args: + training-backward: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-teacher-base: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-teacher-finetuned: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-student: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + training-student-finetuned: + disp-freq: 10 + save-freq: 100 + valid-freq: 100 + after: 500u + decoding-backward: + mini-batch-words: 2000 + decoding-teacher: + mini-batch-words: 1000 + +#marian-args: +# decoding-teacher: + # 2080ti or newer + precision: float16 + +#TODO: extract this info straight from the OPUS model yml info file +datasets: + train: + - opus_GNOME/v1 + devtest: + - sacrebleu_wmt19 + test: + - sacrebleu_wmt19 diff --git a/configs/config.opusmt.yml b/configs/config.opusmt.yml new file mode 100644 index 000000000..4e4dbb6e2 --- /dev/null +++ b/configs/config.opusmt.yml @@ -0,0 +1,55 @@ +#### +# Example of a production config +# Change language pair, experiment name, datasets and other settings if needed +# Training low resource languages might require more tuning of pipeline/training/configs +### + +image: Ftt.sif + +experiment: + name: opusmt + src: en + trg: fi + src_three_letter: eng + trg_three_letter: fin + + #OPUS models are not ensembled, they have different vocabs anyway + teacher-ensemble: 1 + + #URL to the OPUS-MT model to use as the teacher + opusmt-teacher: "https://object.pouta.csc.fi/Tatoeba-MT-models/eng-fin/opusTCv20210807+bt-2021-09-01.zip" + #URL to the OPUS-MT model to use as the backward model + opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/fin-eng/opusTCv20210807+bt-2021-08-25.zip" + + # path to a pretrained backward model (optional) + backward-model: "" + + # limits per downloaded dataset + mono-max-sentences-src: 100000000 + mono-max-sentences-trg: 20000000 + + # split corpus to parallelize translation + split-length: 2000000 + + best-model: perplexity + bicleaner: + default-threshold: 0.5 + dataset-thresholds: + tc_Tatoeba-train-v2021-08-07.eng.fin: 0.5 + +marian-args: + decoding-teacher: + precision: float16 + mini-batch: 512 + +#TODO: extract this info straight from the OPUS model yml info file +datasets: + # parallel training corpus + train: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets to merge for validation while training + devtest: + - tc_Tatoeba-Challenge-v2021-08-07 + # datasets for evaluation + test: + - tc_Tatoeba-Challenge-v2021-08-07 diff --git a/configs/config.prod.yml b/configs/config.prod.yml index c3d54976a..c6722ea6d 100644 --- a/configs/config.prod.yml +++ b/configs/config.prod.yml @@ -149,4 +149,3 @@ datasets: - news-crawl_news.2016 - news-crawl_news.2015 - diff --git a/configs/config.test.yml b/configs/config.test.yml index 1744d8dfe..6d2ecf3a3 100644 --- a/configs/config.test.yml +++ b/configs/config.test.yml @@ -16,6 +16,8 @@ experiment: split-length: 100000 spm-sample-size: 100000 + parallel-max-sentences: 100000 + best-model: chrf bicleaner: diff --git a/docs/opusmt.md b/docs/opusmt.md new file mode 100644 index 000000000..b793e05b6 --- /dev/null +++ b/docs/opusmt.md @@ -0,0 +1,58 @@ +# OPUS-MT integration +The integration with OPUS-MT is based on [GreenNLP fork](https://github.com/GreenNLP/firefox-translations-training). +This fork makes it possible to use OPUS-MT models as teacher and backward models in the _firefox-translations-training_ pipeline (FTT). Other additions are profiles for running jobs on CSC supercomputers (*puhti*, *lumi* and *mahti*) and code for monitoring the power usage of jobs. + +# Workflow changes +- Added download rule for Tatoeba-Challenge data. +- Added download rule for OPUS-MT models (tested with Tatoeba-Challenge models, old models might need some changes) +- Added config parameters for specifying OPUS-MT models as teacher and/or backward model. +- Added subword segmentation and desegmentation rules. + +# Subword segmentation issues +The biggest incompatibility with OPUS-MT models and FTT is in subword segmentation: default FTT trains models that use the in-built sentencepiece support in Marian, while OPUS-MT models expect data to be pre-segmented. To make it possible to use both the default FTT training and pre-built OPUS-MT models, segmentation and desegmentation steps have been added around marian-specific rules. This causes some clutter, but it's probably the best solution (instead of e.g. doing the segmentation/desegmentation inside the marian scripts), since it also makes it possible to easily implement other subword segmentation methods in the workflow. + + +# Snakemake and conda on HPC +FTT is based on Snakemake, which has many benefits in terms of reproducibility and existing support. Among other things, Snakemake supports HPC environments and SLURM out of the box, which should make it ideal for CSC machines. However, Snakemake also makes heavy use of conda, which has been deprecated on CSC machines due to its unsuitability for HPC file systems (https://docs.csc.fi/computing/usage-policy/#conda-installations), and FTT specifically relies on several conda environments. Fortunately, Snakemake has a functionality for containerizing conda environments, so all the conda environments needed by FTT can be provided in an Apptainer container (Ftt.sif). + +Containerization does not entirely solve the conda problem, since the Snakemake program itself requires conda to run. CSC provides a snakemake module, but problematically these modules are container-based, and since containers cannot be nested on CSC machines, it is not possible to use containerized conda environments with the CSC snakemake modules. This can be solved by installing Snakemake with pip (this is discouraged in the Snakemake documentation, but I have seen no problems so far). + +# Non-containerized software +FTT uses software that is not included in the containerized conda environments, including several marian installations and other NLP tools. These are automatically built as part of the pipeline. The Ftt.sif container includes the prerequisites for the software components. It's also possible to provide paths to separately built software installations. + +# Getting started on CSC's puhti and mahti +1. Clone the repository. +2. Download the Ftt.sif container to the repository root. +3. Create a virtual Python environment for Snakemake (e.g. in the parent dir of the repository): + 1. The environment needs to be created with a non-containerized python, as otherwise Apptainer integration will not work. On puhti and mahti, the python executables in /usr/bin/ should work: `/usr/bin/python3.9 -m venv snakemake_env`. + 2. Activate the virtual environment: `source ./snakemake_env/bin/activate`. + 3. Install snakemake: `pip install snakemake`. +4. Install micromamba (e.g. in the parent dir of the repository): `curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba` +5. Return to the repository directory and update Git submodules: `make git-modules` +6. Create a _data_ directory (e.g. in the parent dir of the repository) and create a _tmp_ dir in it. +7. If the data directory is not located in the parent directory of the repository, edit _profiles/slurm-puhti/config.yaml_ or _profiles/slurm-mahti/config.yaml_ and change the bindings in the singularity-args section to point to your data directory, and also enter the _data_ directory path as the _root_ value of the _config_ section. +8. Edit profiles/slurm-puhti/config.cluster.yaml to change the CSC account to one you have access to. +9. Load cuda modules: module load gcc/9.4.0 cuda cudnn +10. Run pipeline: `make run-hpc PROFILE="slurm-puhti"` or `make run PROFILE="slurm-mahti"` + +# Getting started on CSC's lumi +1. Clone the repository. +2. Download the Ftt.sif container to the repository root. +3. Create a virtual Python environment for Snakemake (e.g. in the parent dir of the repository): + 1. The environment needs to be created with a non-containerized python, as otherwise Apptainer integration will not work. On lumi, use the _cray-python_ module (it is not containerized): `module load cray-python; python -m venv snakemake_env`. + 2. Activate the virtual environment: `source ./snakemake_env/bin/activate`. + 3. Install snakemake: `pip install snakemake`. +4. Install micromamba (e.g. in the parent dir of the repository): `curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba` +5. Return to the repository directory and update Git submodules: `make git-modules` +6. Create a _data_ directory (e.g. in the parent dir of the repository) and create a _tmp_ dir in it. +7. If the data directory is not located in the parent directory of the repository, edit profiles/slurm-lumi/config.yaml and change the bindings in the singularity-args section to point to your data directory, and also enter the _data_ directory path as the _root_ value of the _config_ section. +8. Edit profiles/slurm-puhti/config.cluster.yaml to change the CSC account to one you have access to. +9. Load rocm module: module load rocm. +10. Copy the marian executables to _3rd_party/lumi-marian/build_ (compiling lumi-marian is currently hacky, so this workaround makes things easier). +11. Enter _export SINGULARITYENV_LD_LIBRARY_PATH=$LD_LIBRARY_PATH_ to make sure Marian can find all the libraries when it runs containerized. +12. Run pipeline: `make run-hpc PROFILE="slurm-puhti"` + +# Testing +Since running the whole pipeline for a high-resource language pair will take a long time, there is a test config available for testing that everything works as it should. The test config is used by default, you can change into the full config by modifying the Makefile and changing config.opusmt-test.yml to config.opusmt.yml. You can also provide the config on the command line as the CONFIG parameter with make. Note that even the test config will take a long time if the training corpus is large (since translating the training data will take time). So to do a quick functionality check, pick a language pair with as little data as possible in Tatoeba-Challenge (while still having trained forward and backward models). The default epo-afr is good for quick checking (although note that bicleaner step will be skipped, as there are no bicleaner packs for those languages). + +You can test the pipeline without running it by using make dry-run. If you want to build a specific file or rule, you can use the TARGET parameter with make. diff --git a/envs/base.yml b/envs/base.yml index acb023f89..ee7b4aee7 100644 --- a/envs/base.yml +++ b/envs/base.yml @@ -8,7 +8,7 @@ dependencies: - pip=21.2.2 - pip: - sacrebleu==2.0.0 - - mtdata==0.3.2 + - mtdata==0.4.0 - fasttext==0.9.2 - regex==2019.8.19 - sacremoses==0.0.43 diff --git a/envs/bicleaner-ai-lumi.yml b/envs/bicleaner-ai-lumi.yml new file mode 100644 index 000000000..bf448dc5b --- /dev/null +++ b/envs/bicleaner-ai-lumi.yml @@ -0,0 +1,11 @@ +name: bicleaner-ai +channels: + - conda-forge + - defaults +dependencies: + - python=3.9 + - pip==21.2.2 + - cmake=3.21.1 + - pip: + - bicleaner-ai==2.2.1 + - tensorflow-rocm==2.10.0.520 diff --git a/envs/bicleaner-ai.yml b/envs/bicleaner-ai.yml index 2fe482c74..bdc0a235d 100644 --- a/envs/bicleaner-ai.yml +++ b/envs/bicleaner-ai.yml @@ -3,8 +3,9 @@ channels: - conda-forge - defaults dependencies: - - python=3.7 + - python=3.9 - pip==21.2.2 - cmake=3.21.1 - pip: - - bicleaner-ai==1.0.1 \ No newline at end of file + - bicleaner-ai==2.2.1 + - tensorflow==2.6.5 diff --git a/envs/bicleaner.yml b/envs/bicleaner.yml index 23bedae2f..74ffcaed4 100644 --- a/envs/bicleaner.yml +++ b/envs/bicleaner.yml @@ -1,10 +1,13 @@ name: bicleaner channels: - conda-forge + - bitextor - defaults dependencies: - - python=3.7 - - pip==21.2.2 + - python=3.8 + - pip==23.0 - cmake=3.21.1 + - hunspell==1.7.0 - pip: - - bicleaner==0.14 \ No newline at end of file + - pypi-kenlm + - bicleaner==0.16.1 diff --git a/pipeline/bicleaner/bicleaner.sh b/pipeline/bicleaner/bicleaner.sh index baa49da9d..842612948 100755 --- a/pipeline/bicleaner/bicleaner.sh +++ b/pipeline/bicleaner/bicleaner.sh @@ -12,9 +12,10 @@ test -v SRC test -v TRG test -v CUDA_DIR test -v CUDNN_DIR +test -v ROCM_PATH -# cuda and cudnn libs -export LD_LIBRARY_PATH=${CUDA_DIR}/lib64:${CUDNN_DIR}:${LD_LIBRARY_PATH:+LD_LIBRARY_PATH:} +# cuda and cudnn or rocm libs +export LD_LIBRARY_PATH=${CUDA_DIR:+$CUDA_DIR/lib64:}${CUDNN_DIR:+$CUDNN_DIR/lib64:}${ROCM_PATH:+$ROCM_PATH/lib:}${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:} corpus_prefix=$1 output_prefix=$2 @@ -58,6 +59,7 @@ else export tcol=1 fi + #TODO: More than 1 GPU is not supported with AMD GPUs right now (usually 1 is enough, though, it's pretty fast). #Export cuda visible devices if empty or not set if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader); @@ -81,6 +83,12 @@ else paste <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${SRC}.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${TRG}.${ARTIFACT_EXT}") | parallel -j ${#CUDA_VISIBLE_ARRAY[@]} --pipe -k --block 10M biclean "${pack_dir}"/*.yaml {%} | ${COMPRESSION_CMD} >"${output_prefix}.scored.${ARTIFACT_EXT}" + elif [[ "${type}" == 'bicleaner-ai' ]]; then + #Turn on tensorflow logging in bicleaner-ai + export TF_CPP_MIN_LOG_LEVEL=0 + paste <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${SRC}.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${TRG}.${ARTIFACT_EXT}") | + ${cmd} --scol ${scol} --tcol ${tcol} - - "${pack_dir}"/*.yaml | + ${COMPRESSION_CMD} >"${output_prefix}.scored.${ARTIFACT_EXT}" else paste <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${SRC}.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${TRG}.${ARTIFACT_EXT}") | ${cmd} --scol ${scol} --tcol ${tcol} --processes "${threads}" - - "${pack_dir}"/*.yaml | diff --git a/pipeline/bicleaner/download-pack.sh b/pipeline/bicleaner/download-pack.sh index afd33d038..e8a7e607c 100755 --- a/pipeline/bicleaner/download-pack.sh +++ b/pipeline/bicleaner/download-pack.sh @@ -20,7 +20,9 @@ invalid_url() { } if [ "${type}" == 'bicleaner-ai' ]; then - url="https://github.com/bitextor/bicleaner-ai-data/releases/latest/download" + #bicleaner-ai v2.0 full models are only available through Hugging Face. + #TODO: change this code to download models from HF + url="https://github.com/bitextor/bicleaner-ai-data/releases/download/v1.0" prefix="full-" extension="tgz" elif [ "${type}" == 'bicleaner' ]; then diff --git a/pipeline/bicleaner/packs.py b/pipeline/bicleaner/packs.py index 4668b3534..a82a3e0de 100644 --- a/pipeline/bicleaner/packs.py +++ b/pipeline/bicleaner/packs.py @@ -1,7 +1,7 @@ import requests bicleaner = "https://github.com/bitextor/bicleaner-data/releases/latest/download" -bicleaner_ai = "https://github.com/bitextor/bicleaner-ai-data/releases/latest/download" +bicleaner_ai = "https://github.com/bitextor/bicleaner-ai-data/releases/download/v1.0" def _exists(url): diff --git a/pipeline/cefilter/score.sh b/pipeline/cefilter/score.sh index 429dc5ccf..742805552 100755 --- a/pipeline/cefilter/score.sh +++ b/pipeline/cefilter/score.sh @@ -16,17 +16,17 @@ test -v WORKSPACE model=$1 vocab=$2 -corpus_prefix=$3 -output=$4 +#note that target will be used as source, since scoring is done with backward model +source_path=$3 +target_path=$4 +output=$5 ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" - if [ "${ARTIFACT_EXT}" = "zst" ]; then - zstdmt --rm -d "${corpus_prefix}.${SRC}.${ARTIFACT_EXT}" - zstdmt --rm -d "${corpus_prefix}.${TRG}.${ARTIFACT_EXT}" - ARTIFACT_EXT="" -else - ARTIFACT_EXT=".gz" + zstdmt --rm -d "${source_path}.${SRC}.${ARTIFACT_EXT}" + zstdmt --rm -d "${target_path}.${TRG}.${ARTIFACT_EXT}" + source_path="${source_path}.${SRC}" + target_path="${target_path}.${TRG}" fi dir=$(dirname "${output}") @@ -35,7 +35,7 @@ mkdir -p "${dir}" "${MARIAN}/marian-scorer" \ -m "${model}" \ -v "${vocab}" "${vocab}" \ - -t "${corpus_prefix}.${TRG}${ARTIFACT_EXT}" "${corpus_prefix}.${SRC}${ARTIFACT_EXT}" \ + -t "${target_path}" "${source_path}" \ --mini-batch 32 \ --mini-batch-words 1500 \ --maxi-batch 1000 \ diff --git a/pipeline/clean/clean-mono.sh b/pipeline/clean/clean-mono.sh index e344bce2b..de547dfa6 100755 --- a/pipeline/clean/clean-mono.sh +++ b/pipeline/clean/clean-mono.sh @@ -61,6 +61,7 @@ test -s "${output_prefix}.${lang}.langid.${ARTIFACT_EXT}" || ${COMPRESSION_CMD} >"${output_prefix}.${lang}.langid.${ARTIFACT_EXT}" ###################################################################### + echo "### Rule-based filtering" ${COMPRESSION_CMD} -dc "${output_prefix}.${lang}.langid.${ARTIFACT_EXT}" | diff --git a/pipeline/clean/merge-corpus.sh b/pipeline/clean/merge-corpus.sh index ca52ae8ca..ec92f9f59 100755 --- a/pipeline/clean/merge-corpus.sh +++ b/pipeline/clean/merge-corpus.sh @@ -13,7 +13,8 @@ test -v TRG test -v BIN output_prefix=$1 -inputs=( "${@:2}" ) +max_sents=$2 +inputs=( "${@:3}" ) COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" @@ -35,6 +36,12 @@ paste <(${COMPRESSION_CMD} -dc "${tmp}/corpus.${SRC}.dup.${ARTIFACT_EXT}") <(${C ${BIN}/dedupe | ${COMPRESSION_CMD} >"${tmp}.${SRC}${TRG}.${ARTIFACT_EXT}" +# if max sents not "inf", get the first n sents (this is mainly used for testing to make translation and training go faster) +if [ "${max_sents}" != "inf" ]; then + head -${max_sents} <(${COMPRESSION_CMD} -dc "${tmp}.${SRC}${TRG}.gz") | ${COMPRESSION_CMD} > "${tmp}.${SRC}${TRG}.truncated.gz" + mv "${tmp}.${SRC}${TRG}.truncated.gz" "${tmp}.${SRC}${TRG}.gz" +fi + ${COMPRESSION_CMD} -dc "${tmp}.${SRC}${TRG}.${ARTIFACT_EXT}" | cut -f1 | ${COMPRESSION_CMD} > "${output_prefix}.${SRC}.${ARTIFACT_EXT}" ${COMPRESSION_CMD} -dc "${tmp}.${SRC}${TRG}.${ARTIFACT_EXT}" | cut -f2 | ${COMPRESSION_CMD} > "${output_prefix}.${TRG}.${ARTIFACT_EXT}" diff --git a/pipeline/clean/tools/langid_fasttext.py b/pipeline/clean/tools/langid_fasttext.py index 8adde02d4..f0ec851f0 100755 --- a/pipeline/clean/tools/langid_fasttext.py +++ b/pipeline/clean/tools/langid_fasttext.py @@ -22,7 +22,6 @@ def main(): args = parse_user_args() - mpath = os.path.join(os.path.dirname(os.path.realpath(__file__)), BIN) if not os.path.exists(mpath): sys.stderr.write("Downloading model {} ...\n".format(URL)) @@ -33,8 +32,12 @@ def main(): model = fasttext.load_model(mpath) for line in sys.stdin: + # sys.stderr.write(line) + # sys.stderr.write(str(args.field)+"\n") fields = line.strip().split("\t") lid = model.predict(fields[args.field]) + # sys.stderr.write(str(lid)+'\n') + # sys.stderr.write("{}\t{}".format(lid[0][0][-2:], line)) sys.stdout.write("{}\t{}".format(lid[0][0][-2:], line)) diff --git a/pipeline/data/download-tc-data.sh b/pipeline/data/download-tc-data.sh new file mode 100644 index 000000000..e5c0905ec --- /dev/null +++ b/pipeline/data/download-tc-data.sh @@ -0,0 +1,55 @@ +#!/bin/bash +## +# Downloads Tatoeba Challenge data (train, devset and eval in same package) +# + +set -x +set -euo pipefail + +echo "###### Downloading Tatoeba-Challenge data" + +src_three_letter=$1 +trg_three_letter=$2 +src=$3 +trg=$4 +output_prefix=$5 +version=$6 +max_sents=$7 + +tmp="$(dirname "${output_prefix}")/${version}" +mkdir -p "${tmp}" + +archive_path="${tmp}/${version}-${src_three_letter}-${trg_three_letter}.tar" + +#try both combinations of language codes +if wget -O "${archive_path}" "https://object.pouta.csc.fi/${version}/${src_three_letter}-${trg_three_letter}.tar"; then + package_src=${src} + package_trg=${trg} +elif wget -O "${archive_path}" "https://object.pouta.csc.fi/${version}/${trg_three_letter}-${src_three_letter}.tar"; then + package_src=${trg} + package_trg=${src} +fi + +#extract all in same directory, saves the trouble of parsing directory structure +tar -xf "${archive_path}" --directory ${tmp} --strip-components 4 + + +# if max sents not -1, get the first n sents (this is mainly used for testing to make translation and training go faster) +if [ "${max_sents}" != "inf" ]; then + head -${max_sents} <(pigz -dc "${tmp}/train.src.gz") | pigz > "${output_prefix}/corpus/tc_${version}.${package_src}.gz" + head -${max_sents} <(pigz -dc "${tmp}/train.trg.gz") | pigz > "${output_prefix}/corpus/tc_${version}.${package_trg}.gz" +else + mv ${tmp}/train.src.gz ${output_prefix}/corpus/tc_${version}.${package_src}.gz + mv ${tmp}/train.trg.gz ${output_prefix}/corpus/tc_${version}.${package_trg}.gz +fi + +cat ${tmp}/dev.src | gzip > ${output_prefix}/devset/tc_${version}.${package_src}.gz +cat ${tmp}/dev.trg | gzip > ${output_prefix}/devset/tc_${version}.${package_trg}.gz + +cat ${tmp}/test.src | gzip > ${output_prefix}/eval/tc_${version}.${package_src}.gz +cat ${tmp}/test.trg | gzip > ${output_prefix}/eval/tc_${version}.${package_trg}.gz + +rm -rf "${tmp}" + + +echo "###### Done: Downloading Tatoeba-Challenge data" diff --git a/pipeline/data/importers/corpus/flores.sh b/pipeline/data/importers/corpus/flores.sh index b2d4daabe..088ee14a4 100755 --- a/pipeline/data/importers/corpus/flores.sh +++ b/pipeline/data/importers/corpus/flores.sh @@ -30,6 +30,8 @@ flores_code() { flores_code="zho_simpl" elif [ "${code}" == "zh-Hant" ]; then flores_code="zho_trad" + elif [ "${code}" == "sw" ]; then + flores_code="swh" else flores_code=$(python3 -c "from mtdata.iso import iso3_code; print(iso3_code('${code}', fail_error=True))") fi diff --git a/pipeline/data/importers/mono/news-crawl.sh b/pipeline/data/importers/mono/news-crawl.sh index 47912b90c..eb5ce6fda 100755 --- a/pipeline/data/importers/mono/news-crawl.sh +++ b/pipeline/data/importers/mono/news-crawl.sh @@ -15,7 +15,7 @@ ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" echo "###### Downloading WMT newscrawl monolingual data" -curl -L "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.deduped.gz" | \ +wget -qO- "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.deduped.gz" | \ gunzip | ${COMPRESSION_CMD} -c > "${output_prefix}.${ARTIFACT_EXT}" echo "###### Done: Downloading WMT newscrawl monolingual data" diff --git a/pipeline/opusmt/download-model.sh b/pipeline/opusmt/download-model.sh new file mode 100644 index 000000000..d1d5ed4ed --- /dev/null +++ b/pipeline/opusmt/download-model.sh @@ -0,0 +1,44 @@ +#!/bin/bash +## +# Downloads a pretrained opus mt (or tatoeba-challenge) model +# + +set -x +set -euo pipefail + +echo "###### Downloading pretrained opus model" + +download_url=$1 + +model_dir=$2 +best_model=$3 +source_lang=$4 +target_lang=$5 + +#if download url is best, find the best model from list +if [[ $download_url = "best" ]] +then + model_list="${model_dir}/released-model-results.txt" + wget -O ${model_list} "https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master/models/released-model-results.txt" + download_url=$(grep -P -m 1 "^${source_lang}-${target_lang}" ${model_list} | cut -f 4) + echo "###### Using best ${source_lang}-${target_lang} model ${download_url}" +fi + +model_zip=${download_url##*/} +archive_path="${model_dir}/${model_zip}" + +wget -O "${archive_path}" "${download_url}" + +cd ${model_dir} +unzip -j -o "${archive_path}" +rm ${archive_path} + +model_file=$(ls *.npz) +vocab_file=$(ls *vocab.yml) +#Create a soft link for the model with the name that the workflow expects +ln -s $model_file ${best_model} +#Also create a standard name link for the vocab +ln -s $vocab_file "vocab.yml" + + +echo "###### Done: Downloading and extracting opus mt model" diff --git a/pipeline/setup/compile-browsermt-marian-dev.sh b/pipeline/setup/compile-browsermt-marian-dev.sh new file mode 100644 index 000000000..62f9ecb1e --- /dev/null +++ b/pipeline/setup/compile-browsermt-marian-dev.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## +# Installs and compiles marian +# + +set -x +set -euo pipefail + +echo "###### Compiling marian" + +test -v CUDA_DIR + +marian_dir=$1 +threads=$2 +extra_args=( "${@:3}" ) + +mkdir -p "${marian_dir}" +cd "${marian_dir}" +cmake .. -DUSE_SENTENCEPIECE=on -DUSE_FBGEMM=on -DCOMPILE_CUDA=off -DCOMPILE_CPU=on -DCMAKE_BUILD_TYPE=Release \ + "${extra_args[@]}" +make -j "${threads}" + +echo "###### Done: Compiling marian" diff --git a/pipeline/setup/compile-marian.sh b/pipeline/setup/compile-marian-dev.sh similarity index 100% rename from pipeline/setup/compile-marian.sh rename to pipeline/setup/compile-marian-dev.sh diff --git a/pipeline/setup/install-deps.sh b/pipeline/setup/install-deps.sh index d9fc8a920..d0f720293 100755 --- a/pipeline/setup/install-deps.sh +++ b/pipeline/setup/install-deps.sh @@ -13,9 +13,11 @@ apt-get update echo "### Installing extra dependencies" apt-get install -y pigz htop wget unzip parallel bc git +echo "### Installing bicleaner dependencies" +apt-get install -y libhunspell-dev + echo "### Installing marian dependencies" -apt-get install -y build-essential libboost-system-dev libprotobuf10 \ - protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev +apt-get install -y git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev echo "### Installing Intel MKL" wget -qO- 'https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB' | apt-key add - diff --git a/pipeline/setup/install-kenlm.sh b/pipeline/setup/install-kenlm.sh index a566e8b3d..11387e645 100755 --- a/pipeline/setup/install-kenlm.sh +++ b/pipeline/setup/install-kenlm.sh @@ -21,6 +21,6 @@ cmake .. -DKENLM_MAX_ORDER=7 -DCMAKE_INSTALL_PREFIX:PATH="${BIN}/kenlm" make -j "${threads}" install cd .. -python -m pip install . --install-option="--max_order 7" +python -m pip install --user . --install-option="--max_order 7" echo "###### Done: Installing kenlm" diff --git a/pipeline/translate/collect-mono.sh b/pipeline/translate/collect-mono.sh new file mode 100644 index 000000000..eb96f6b3f --- /dev/null +++ b/pipeline/translate/collect-mono.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# +# Merges translation outputs into a dataset +# + +set -x +set -euo pipefail + + +dir=$1 +output_path=$2 +mono_path=$3 +model_index="${4:-""}" + +echo "### Collecting translations" +cat "${dir}"/*${model_index}.out | pigz >"${output_path}" + +echo "### Comparing number of sentences in source and artificial target files" +src_len=$(pigz -dc "${mono_path}" | wc -l) +trg_len=$(pigz -dc "${output_path}" | wc -l) +if [ "${src_len}" != "${trg_len}" ]; then + echo "### Error: length of ${mono_path} ${src_len} is different from ${output_path} ${trg_len}" + exit 1 +fi diff --git a/pipeline/translate/collect.sh b/pipeline/translate/collect.sh index 3417464f4..9c53d82c3 100755 --- a/pipeline/translate/collect.sh +++ b/pipeline/translate/collect.sh @@ -10,12 +10,12 @@ set -euo pipefail dir=$1 output_path=$2 mono_path=$3 - +model_index="${4:-""}" COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" echo "### Collecting translations" -cat "${dir}"/*.out | ${COMPRESSION_CMD} >"${output_path}" +cat "${dir}"/*${model_index}.out | ${COMPRESSION_CMD} >"${output_path}" echo "### Comparing number of sentences in source and artificial target files" src_len=$(${COMPRESSION_CMD} -dc "${mono_path}" | wc -l) diff --git a/pipeline/translate/decoder.yml b/pipeline/translate/decoder.yml index 4b2b2d3df..c6c47d2fb 100644 --- a/pipeline/translate/decoder.yml +++ b/pipeline/translate/decoder.yml @@ -1,10 +1,13 @@ normalize: 1.0 word-penalty: 0 mini-batch: 16 -mini-batch-words: 500 maxi-batch: 1000 maxi-batch-sort: src +mini-batch-words: 500 max-length: 200 max-length-crop: true beam-size: 8 -quiet-translation: True \ No newline at end of file +quiet-translation: true +#upgrade marian to enable this +#data-threads: 8 +#log-level: debug diff --git a/pipeline/translate/merge-corpus.sh b/pipeline/translate/merge-corpus.sh index 009a72a05..129fd73e8 100755 --- a/pipeline/translate/merge-corpus.sh +++ b/pipeline/translate/merge-corpus.sh @@ -22,10 +22,11 @@ echo "###### Merging datasets" src1=$1 src2=$2 -trg1=$3 -trg2=$4 +trg1_template=$3 +trg2_template=$4 res_src=$5 res_trg=$6 +model_indices=("${@:7}") COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" @@ -33,8 +34,28 @@ ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" tmp_dir="$(dirname "${res_src}")/tmp" mkdir -p "${tmp_dir}" -cat <(${COMPRESSION_CMD} -dc "${src1}") <(${COMPRESSION_CMD} -dc "${src2}") | ${COMPRESSION_CMD} >"${tmp_dir}/original.src.${ARTIFACT_EXT}" -cat <(${COMPRESSION_CMD} -dc "${trg1}") <(${COMPRESSION_CMD} -dc "${trg2}") | ${COMPRESSION_CMD} >"${tmp_dir}/original.trg.${ARTIFACT_EXT}" +#opusmt +if [ ! -z "${model_indices}" ]; then + # merge output from different teachers + for model_index in "${model_indices[@]}" + do + ${COMPRESSION_CMD} -dc "${src1}" >> "${tmp_dir}/original.src" + ${COMPRESSION_CMD} -dc "${trg1_template/model_index/"$model_index"}" >> "${tmp_dir}/original.trg" + # mono src might be empty + if [ -s ${src2} ]; then + ${COMPRESSION_CMD} -dc "${src2}" >> "${tmp_dir}/original.src" + ${COMPRESSION_CMD} -dc "${trg2_template/model_index/"$model_index"}" >> "${tmp_dir}/original.trg" + fi + done + ${COMPRESSION_CMD} "${tmp_dir}/original.src" + ${COMPRESSION_CMD} "${tmp_dir}/original.trg" +# task cluster +else + cat <(${COMPRESSION_CMD} -dc "${src1}") <(${COMPRESSION_CMD} -dc "${src2}") | + ${COMPRESSION_CMD} >"${tmp_dir}/original.src.${ARTIFACT_EXT}" + cat <(${COMPRESSION_CMD} -dc "${trg1_template}") <(${COMPRESSION_CMD} -dc "${trg2_template}") | + ${COMPRESSION_CMD} >"${tmp_dir}/original.trg.${ARTIFACT_EXT}" +fi echo "#### Deduplicating" paste <(${COMPRESSION_CMD} -dc "${tmp_dir}/original.src.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${tmp_dir}/original.trg.${ARTIFACT_EXT}") | diff --git a/pipeline/translate/opusmt-preprocess.sh b/pipeline/translate/opusmt-preprocess.sh new file mode 100644 index 000000000..2ed5d5f2d --- /dev/null +++ b/pipeline/translate/opusmt-preprocess.sh @@ -0,0 +1,49 @@ +#!/bin/bash +## +# Applies OPUS-MT preprocessing to corpus +# + +set -x +set -euo pipefail + + +source_file=$1 +opusmt_model=$2 +source_lang=$3 +spm_name=$4 +spm_encoder=$5 +export PATH=$PATH:$(dirname ${spm_encoder}) +model_dir=$(dirname $2) + +# When splits are preprocessed, different models need different preprocessing, +# so model index is given. Check for unset parameter. +if [ $# -ge 7 ]; then + model_index_suffix=".$7" +else + model_index_suffix="" +fi + + +#target_lang_token needs to be provided for multilingual models +#first check whether model is multilingual AND preprocessing isdone on source side (never language tags on target side) +if grep -q ">>id<<" "${model_dir}/README.md" && [ ${spm_name} == "source.spm" ]; then + target_lang_token=$6 + if [ -n "${target_lang_token}" ]; then + #add space after lang token + target_lang_token=">>${target_lang_token}<< " + else + echo "no target lang token provided" + exit 1 + fi +else + target_lang_token="" +fi + + +if [ "${source_file##*.}" == "gz" ]; then + echo "source file is gzipped" + zcat $1 | pipeline/translate/preprocess.sh $3 "${model_dir}/${spm_name}" | sed -e "s/^/${target_lang_token}/" | gzip > ${source_file%%.gz}${model_index_suffix}.opusmt.gz +else + echo "source file is not gzipped" + pipeline/translate/preprocess.sh $3 "${model_dir}/${spm_name}" < $1 | sed -e "s/^/${target_lang_token}/" > $1${model_index_suffix}.opusmt +fi diff --git a/pipeline/translate/preprocess.sh b/pipeline/translate/preprocess.sh new file mode 100755 index 000000000..9eb72bdd9 --- /dev/null +++ b/pipeline/translate/preprocess.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# +# USAGE preprocess.sh langid spmodel < input > output +# +# replace SPMENCODE with your own setup! +# +# CHANGES +# +# * issue with perl code that removes control characters +# unicode property Other = \p{C}) seems to remove +# newline characters as well --> add negative lookahead +# to avoid removing newline characters! +# +# +SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"` + +## simple pre-processing steps adapted from Moses tools + +sed -e 's/,/,/g' \ + -e 's/。 */. /g' \ + -e 's/、/,/g' \ + -e 's/”/"/g' \ + -e 's/“/"/g' \ + -e 's/∶/:/g' \ + -e 's/:/:/g' \ + -e 's/?/\?/g' \ + -e 's/《/"/g' \ + -e 's/》/"/g' \ + -e 's/)/\)/g' \ + -e 's/!/\!/g' \ + -e 's/(/\(/g' \ + -e 's/;/;/g' \ + -e 's/1/"/g' \ + -e 's/」/"/g' \ + -e 's/「/"/g' \ + -e 's/0/0/g' \ + -e 's/3/3/g' \ + -e 's/2/2/g' \ + -e 's/5/5/g' \ + -e 's/6/6/g' \ + -e 's/9/9/g' \ + -e 's/7/7/g' \ + -e 's/8/8/g' \ + -e 's/4/4/g' \ + -e 's/. */. /g' \ + -e 's/~/\~/g' \ + -e "s/’/\'/g" \ + -e 's/…/\.\.\./g' \ + -e 's/━/\-/g' \ + -e 's/〈/\/g' \ + -e 's/【/\[/g' \ + -e 's/】/\]/g' \ + -e 's/%/\%/g' | +perl -C -pe 's/(?!\n)\p{C}/ /g;' | +perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\ +sed 's/ */ /g;s/^ *//g;s/ *$//g' | +${SPMENCODE} --model $2 + diff --git a/pipeline/translate/translate-nbest.sh b/pipeline/translate/translate-nbest.sh index 97a66b30b..3cc7011f8 100755 --- a/pipeline/translate/translate-nbest.sh +++ b/pipeline/translate/translate-nbest.sh @@ -11,8 +11,20 @@ test -v MARIAN test -v WORKSPACE input=$1 -vocab=$2 -models=( "${@:3}" ) +output=$2 +models=( "${@:4}" ) +modeldir=$(dirname ${models}) + + +#if the model is an OPUS-MT model, use the model vocab instead of the defined forward vocab +for opus_vocab in ${modeldir}/opus*.vocab.yml; do + if [[ -f ${opus_vocab} ]]; then + vocab=${opus_vocab} + else + vocab=$3 + fi + break +done cd "$(dirname "${0}")" @@ -21,10 +33,10 @@ cd "$(dirname "${0}")" -m "${models[@]}" \ -v "${vocab}" "${vocab}" \ -i "${input}" \ - -o "${input}.nbest" \ + -o "${output}" \ --log "${input}.log" \ --n-best \ -d ${GPUS} \ -w "${WORKSPACE}" -test "$(wc -l <"${input}.nbest")" -eq "$(( $(wc -l <"${input}") * 8 ))" \ No newline at end of file +test "$(wc -l <"${output}")" -eq "$(( $(wc -l <"${input}") * 8 ))" diff --git a/pipeline/translate/translate.sh b/pipeline/translate/translate.sh index f046ae532..efb6e28dc 100755 --- a/pipeline/translate/translate.sh +++ b/pipeline/translate/translate.sh @@ -11,9 +11,19 @@ test -v MARIAN test -v WORKSPACE input=$1 -vocab=$2 -models=( "${@:3}" ) +output=$2 +models=( "${@:4}" ) +modeldir=$(dirname ${models}) +#if the model is an OPUS-MT model, use the model vocab instead of the defined forward vocab +for opus_vocab in ${modeldir}/opus*.vocab.yml; do + if [[ -f ${opus_vocab} ]]; then + vocab=${opus_vocab} + else + vocab=$3 + fi + break +done cd "$(dirname "${0}")" @@ -22,9 +32,9 @@ cd "$(dirname "${0}")" -m "${models[@]}" \ -v "${vocab}" "${vocab}" \ -i "${input}" \ - -o "${input}.out" \ + -o "${output}" \ --log "${input}.log" \ -d ${GPUS} \ -w "${WORKSPACE}" -test "$(wc -l <"${input}")" == "$(wc -l <"${input}.out")" +test "$(wc -l <"${input}")" == "$(wc -l <"${output}")" diff --git a/profiles/local-container-cpu/config.yaml b/profiles/local-container-cpu/config.yaml new file mode 100755 index 000000000..40339301c --- /dev/null +++ b/profiles/local-container-cpu/config.yaml @@ -0,0 +1,19 @@ +verbose: false +jobs: 1 +use-conda: true +resources: gpu=8 +cores: 1 +cache: false +reason: true +use-singularity: true +singularity-args: "--bind /home/tommi/greennlp/data/,/home/tommi/greennlp/data/temp/:/tmp --containall" +config: + - deps=true + - root=/home/tommi/greennlp/data + - workspace=12000 + - cuda=/cuda + - cudnn=/cudnn + - numgpus=8 + - mariancmake="" + # (optional) override available GPU ids, example gpus=0 2 5 6 + - gpus="" diff --git a/profiles/local-container/config.yaml b/profiles/local-container/config.yaml index 06bea96fa..9955de2e4 100755 --- a/profiles/local-container/config.yaml +++ b/profiles/local-container/config.yaml @@ -1,18 +1,14 @@ -verbose: false +verbose: true +use-singularity: true use-conda: true -resources: gpu=8 -cores: all +resources: gpu=1 +cores: 8 cache: false reason: true -use-singularity: true -singularity-args: "--bind /data,/usr/local/cuda,/usr/lib/x86_64-linux-gnu:/cudnn --nv" +singularity-args: "--bind $PWD/../data,$CUDA_INSTALL_ROOT,$CUDNN_INSTALL_ROOT,$PWD/../data/tmp:/tmp --nv --containall" config: - - deps=true - - root=/data - - cuda=/usr/local/cuda - - cudnn=/cudnn - - workspace=12000 - - numgpus=8 + - deps=false + - workspace=10000 + - numgpus=1 - mariancmake="" - # (optional) override available GPU ids, example gpus=0 2 5 6 - - gpus="" \ No newline at end of file + - gpus="" diff --git a/profiles/local/config.yaml b/profiles/local/config.yaml index fa632449f..8a5a7aad1 100755 --- a/profiles/local/config.yaml +++ b/profiles/local/config.yaml @@ -1,20 +1,20 @@ verbose: false use-conda: true -resources: gpu=8 -cores: all +resources: gpu=0 +cores: 1 cache: false reason: true config: # install dependencies on a local machine - deps=true # root path to a folder with data, models and logs - - root=/data + - root=/home/tommin/greennlp/data - cuda=/usr/local/cuda - cudnn=/cudnn # how much memory Marian reserves on a GPU - - workspace=12000 + - workspace=2000 # a number of GPUs you want to use - - numgpus=8 + - numgpus=0 - mariancmake="" # (optional) override available GPU ids, example gpus=0 2 5 6 - - gpus="" \ No newline at end of file + - gpus="" diff --git a/profiles/slurm-lumi-test/config.cluster.yaml b/profiles/slurm-lumi-test/config.cluster.yaml new file mode 100755 index 000000000..b5afc816c --- /dev/null +++ b/profiles/slurm-lumi-test/config.cluster.yaml @@ -0,0 +1,7 @@ +# CSC Puhti +single-gpu-partition: dev-g +multi-gpu-partition: dev-g +cpu-partition: debug +cpu-account: project_462000088 +gpu-account: project_462000088 +time-limit: "00:30:00" diff --git a/profiles/slurm-lumi-test/config.yaml b/profiles/slurm-lumi-test/config.yaml new file mode 100755 index 000000000..7a62fb1e8 --- /dev/null +++ b/profiles/slurm-lumi-test/config.yaml @@ -0,0 +1,31 @@ +cluster: "submit.py" +cluster-status: "status.py" +jobscript: "jobscript.sh" +jobs: 1 +restart-times: 0 +immediate-submit: false +verbose: false +max-jobs-per-second: 1 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 60 +rerun-incomplete: true # recomended for cluster submissions +keep-going: false +default-resources: "mem_mb=64000" +use-singularity: true +use-conda: true +resources: gpu=1 +cores: 8 +#cache: false +reason: true + +singularity-args: "--bind $PWD/../data,$PWD/../data/tmp:/tmp --rocm --containall" +config: + - deps=false + #- root=/pfs/lustrep1/scratch/project_462000088/members/niemine1/data + #- rocm=/opt/rocm + - workspace=20000 + - numgpus=1 + - mariancmake="" + - gpus="" + - marianversion="lumi-marian" diff --git a/profiles/slurm-lumi-test/jobscript.sh b/profiles/slurm-lumi-test/jobscript.sh new file mode 100755 index 000000000..53f7e2b59 --- /dev/null +++ b/profiles/slurm-lumi-test/jobscript.sh @@ -0,0 +1,24 @@ +#!/bin/bash -x +# properties = {properties} + +#parse properties json and get log file name +log_file=$(echo '{properties}' | jq -r .log[0]) +gpu=$(echo '{properties}' | jq -r .resources.gpu) + +mkdir -p $(dirname $log_file) + +if [ $gpu != "null" ] && [ $gpu != "0" ]; then + #this will add the header row for the csv file, it will be removed for later log lines + rocm-smi --csv --showuse --showmemuse --showenergycounter 2> /dev/null | head -1 > $log_file.gpu + while true; do + rocm-smi --csv --showuse --showmemuse --showenergycounter 2> /dev/null | \ + grep -v "device" | xargs -I {{}} echo -e "$(date "+%Y-%m-%d_%H:%M:%S")\t{{}}" >> $log_file.gpu; sleep 10; + done & + rocmloop_pid=$! +fi + +{exec_job} + +if [ -z $rocmloop_pid ]; then + kill $rocmloop_pid +fi diff --git a/profiles/slurm-lumi-test/status.py b/profiles/slurm-lumi-test/status.py new file mode 100755 index 000000000..847a9393b --- /dev/null +++ b/profiles/slurm-lumi-test/status.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import logging +import re +import shlex +import subprocess as sp +import sys +import time + +logger = logging.getLogger("__name__") + +STATUS_ATTEMPTS = 20 + +jobid = sys.argv[1] + +for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n")) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + logger.error(e) + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}")) + m = re.search(r"JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + sys.exit(0) + else: + time.sleep(1) + +status = res[jobid] + +if status == "BOOT_FAIL": + print("failed") +elif status == "OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/profiles/slurm-lumi-test/submit.py b/profiles/slurm-lumi-test/submit.py new file mode 100755 index 000000000..26cad5456 --- /dev/null +++ b/profiles/slurm-lumi-test/submit.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +import os +import re +import subprocess as sp +import sys + +import yaml +from snakemake.logging import logger +from snakemake.utils import read_job_properties + +cluster_config_file = os.path.join(os.path.dirname(__file__), "config.cluster.yaml") +cluster_config = yaml.load(open(cluster_config_file), Loader=yaml.FullLoader) + +jobscript = sys.argv[-1] +job_properties = read_job_properties(jobscript) + +options = [] + +if job_properties["type"] == "single": + name = job_properties["rule"] +elif job_properties["type"] == "group": + name = job_properties["groupid"] +else: + raise NotImplementedError( + f"Don't know what to do with job_properties['type']=={job_properties['type']}" + ) + +options += ["--job-name", name] + +partition = cluster_config["cpu-partition"] +account = cluster_config["cpu-account"] + +if "resources" in job_properties: + resources = job_properties["resources"] + + if "gpu" in resources and int(resources["gpu"]) >= 1: + num_gpu = str(resources["gpu"]) + # options += [f'--gres=gpu:v100:{num_gpu}'] + options += ["--gpus=1"] + account = cluster_config["gpu-account"] + + if num_gpu == "1": + partition = cluster_config["single-gpu-partition"] + else: + partition = cluster_config["multi-gpu-partition"] + rocm_dir = os.getenv("ROCM_PATH") + options += ["--export", f'ALL,SINGULARITY_BIND="{rocm_dir}"'] + + # we don't need explicit memory limiting for now + if "mem_mb" in resources: + memory = str(resources["mem_mb"]) + options += [f"--mem={memory}"] + + +options += ["-p", partition] +options += ["-A", account] +options += ["--nodes=1"] +options += ["-t", str(cluster_config["time-limit"])] + +if "threads" in job_properties: + options += ["--cpus-per-task", str(job_properties["threads"])] + +try: + # cmd = ["sbatch"] + ["--parsable"] + options + [f"--wrap=\"/bin/bash -c '{jobscript}'\""] + cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] + logger.info(f"Running command: {cmd}") + res = sp.check_output(cmd) +except sp.CalledProcessError as e: + raise e +# Get jobid +res = res.decode() +try: + jobid = re.search(r"(\d+)", res).group(1) +except Exception as e: + raise e + +print(jobid) diff --git a/profiles/slurm-lumi/config.cluster.yaml b/profiles/slurm-lumi/config.cluster.yaml new file mode 100755 index 000000000..365b82409 --- /dev/null +++ b/profiles/slurm-lumi/config.cluster.yaml @@ -0,0 +1,7 @@ +# CSC Puhti +single-gpu-partition: small-g +multi-gpu-partition: small-g +cpu-partition: small +cpu-account: project_462000088 +gpu-account: project_462000088 +time-limit: "72:00:00" diff --git a/profiles/slurm-lumi/config.yaml b/profiles/slurm-lumi/config.yaml new file mode 100755 index 000000000..d6ffba2b7 --- /dev/null +++ b/profiles/slurm-lumi/config.yaml @@ -0,0 +1,31 @@ +cluster: "submit.py" +cluster-status: "status.py" +jobscript: "jobscript.sh" +jobs: 1 +restart-times: 0 +immediate-submit: false +verbose: false +max-jobs-per-second: 1 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 60 +rerun-incomplete: true # recomended for cluster submissions +keep-going: false +default-resources: "mem_mb=64000" +use-singularity: true +use-conda: true +resources: gpu=1 +cores: 8 +#cache: false +reason: true + +singularity-args: "--bind $ROCM_PATH,$PWD/../data,$PWD/../data/tmp:/tmp --rocm --containall" +config: + - deps=false + #- root=/pfs/lustrep1/scratch/project_462000088/members/niemine1/data + #- rocm=/opt/rocm + - workspace=20000 + - numgpus=1 + - mariancmake="" + - gpus="" + - marianversion="lumi-marian" diff --git a/profiles/slurm-lumi/jobscript.sh b/profiles/slurm-lumi/jobscript.sh new file mode 100755 index 000000000..53f7e2b59 --- /dev/null +++ b/profiles/slurm-lumi/jobscript.sh @@ -0,0 +1,24 @@ +#!/bin/bash -x +# properties = {properties} + +#parse properties json and get log file name +log_file=$(echo '{properties}' | jq -r .log[0]) +gpu=$(echo '{properties}' | jq -r .resources.gpu) + +mkdir -p $(dirname $log_file) + +if [ $gpu != "null" ] && [ $gpu != "0" ]; then + #this will add the header row for the csv file, it will be removed for later log lines + rocm-smi --csv --showuse --showmemuse --showenergycounter 2> /dev/null | head -1 > $log_file.gpu + while true; do + rocm-smi --csv --showuse --showmemuse --showenergycounter 2> /dev/null | \ + grep -v "device" | xargs -I {{}} echo -e "$(date "+%Y-%m-%d_%H:%M:%S")\t{{}}" >> $log_file.gpu; sleep 10; + done & + rocmloop_pid=$! +fi + +{exec_job} + +if [ -z $rocmloop_pid ]; then + kill $rocmloop_pid +fi diff --git a/profiles/slurm-lumi/status.py b/profiles/slurm-lumi/status.py new file mode 100755 index 000000000..847a9393b --- /dev/null +++ b/profiles/slurm-lumi/status.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import logging +import re +import shlex +import subprocess as sp +import sys +import time + +logger = logging.getLogger("__name__") + +STATUS_ATTEMPTS = 20 + +jobid = sys.argv[1] + +for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n")) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + logger.error(e) + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}")) + m = re.search(r"JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + sys.exit(0) + else: + time.sleep(1) + +status = res[jobid] + +if status == "BOOT_FAIL": + print("failed") +elif status == "OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/profiles/slurm-lumi/submit.py b/profiles/slurm-lumi/submit.py new file mode 100755 index 000000000..26cad5456 --- /dev/null +++ b/profiles/slurm-lumi/submit.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +import os +import re +import subprocess as sp +import sys + +import yaml +from snakemake.logging import logger +from snakemake.utils import read_job_properties + +cluster_config_file = os.path.join(os.path.dirname(__file__), "config.cluster.yaml") +cluster_config = yaml.load(open(cluster_config_file), Loader=yaml.FullLoader) + +jobscript = sys.argv[-1] +job_properties = read_job_properties(jobscript) + +options = [] + +if job_properties["type"] == "single": + name = job_properties["rule"] +elif job_properties["type"] == "group": + name = job_properties["groupid"] +else: + raise NotImplementedError( + f"Don't know what to do with job_properties['type']=={job_properties['type']}" + ) + +options += ["--job-name", name] + +partition = cluster_config["cpu-partition"] +account = cluster_config["cpu-account"] + +if "resources" in job_properties: + resources = job_properties["resources"] + + if "gpu" in resources and int(resources["gpu"]) >= 1: + num_gpu = str(resources["gpu"]) + # options += [f'--gres=gpu:v100:{num_gpu}'] + options += ["--gpus=1"] + account = cluster_config["gpu-account"] + + if num_gpu == "1": + partition = cluster_config["single-gpu-partition"] + else: + partition = cluster_config["multi-gpu-partition"] + rocm_dir = os.getenv("ROCM_PATH") + options += ["--export", f'ALL,SINGULARITY_BIND="{rocm_dir}"'] + + # we don't need explicit memory limiting for now + if "mem_mb" in resources: + memory = str(resources["mem_mb"]) + options += [f"--mem={memory}"] + + +options += ["-p", partition] +options += ["-A", account] +options += ["--nodes=1"] +options += ["-t", str(cluster_config["time-limit"])] + +if "threads" in job_properties: + options += ["--cpus-per-task", str(job_properties["threads"])] + +try: + # cmd = ["sbatch"] + ["--parsable"] + options + [f"--wrap=\"/bin/bash -c '{jobscript}'\""] + cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] + logger.info(f"Running command: {cmd}") + res = sp.check_output(cmd) +except sp.CalledProcessError as e: + raise e +# Get jobid +res = res.decode() +try: + jobid = re.search(r"(\d+)", res).group(1) +except Exception as e: + raise e + +print(jobid) diff --git a/profiles/slurm-mahti-test/config.cluster.yaml b/profiles/slurm-mahti-test/config.cluster.yaml new file mode 100755 index 000000000..0e7ea001a --- /dev/null +++ b/profiles/slurm-mahti-test/config.cluster.yaml @@ -0,0 +1,7 @@ +# CSC Puhti +single-gpu-partition: gputest +multi-gpu-partition: gputest +cpu-partition: test +cpu-account: project_2007095 +gpu-account: project_2007095 +time-limit: "00:15:00" diff --git a/profiles/slurm-mahti-test/config.yaml b/profiles/slurm-mahti-test/config.yaml new file mode 100755 index 000000000..754cd98f7 --- /dev/null +++ b/profiles/slurm-mahti-test/config.yaml @@ -0,0 +1,33 @@ +cluster: "submit.py" +cluster-status: "status.py" +jobscript: "jobscript.sh" +jobs: 1 +restart-times: 0 +immediate-submit: false +verbose: true +max-jobs-per-second: 1 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 60 +rerun-incomplete: true # recomended for cluster submissions +keep-going: false +default-resources: ["mem_mb=64000"] +use-singularity: true +use-conda: true +resources: gpu=1 +cores: 8 +cache: false +reason: true +# if CPU nodes don't have access to cuda dirs, use +# export CUDA_DIR=$(CUDA_DIR); $(SNAKEMAKE) ... +# singularity-args: "--bind $(SHARED_ROOT),/tmp --nv --containall" +singularity-args: "--bind /scratch/project_2007095/tommi/data,$CUDA_INSTALL_ROOT,$CUDNN_INSTALL_ROOT:/cudnn,/scratch/project_2007095/tommi/data/temp:/tmp --nv --containall" +config: + - deps=false + - root=/scratch/project_2007095/tommi/data + - cuda=/appl/spack/v017/install-tree/gcc-11.2.0/cuda-11.5.0-mg4ztb/ + - cudnn=/appl/spack/v017/install-tree/gcc-11.2.0/cudnn-8.3.3.40-11.5-crjjbv/ + - workspace=10000 + - numgpus=1 + - mariancmake="" + - gpus="0" diff --git a/profiles/slurm-mahti-test/jobscript.sh b/profiles/slurm-mahti-test/jobscript.sh new file mode 100755 index 000000000..f11010a52 --- /dev/null +++ b/profiles/slurm-mahti-test/jobscript.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# properties = {properties} + +#parse properties json and get log file name +log_file=$(echo '{properties}' | jq -r .log[0]) + +export APPTAINERENV_CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES + +if command -v nvidia-smi &> /dev/null +then + nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,power.draw,memory.total,memory.free,memory.used --format=csv -l 1 > $log_file.gpu & +fi + +{exec_job} + +if command -v nvidia-smi &> /dev/null +then + kill %1 + gzip $log_file.gpu +fi diff --git a/profiles/slurm-mahti-test/status.py b/profiles/slurm-mahti-test/status.py new file mode 100755 index 000000000..847a9393b --- /dev/null +++ b/profiles/slurm-mahti-test/status.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import logging +import re +import shlex +import subprocess as sp +import sys +import time + +logger = logging.getLogger("__name__") + +STATUS_ATTEMPTS = 20 + +jobid = sys.argv[1] + +for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n")) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + logger.error(e) + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}")) + m = re.search(r"JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + sys.exit(0) + else: + time.sleep(1) + +status = res[jobid] + +if status == "BOOT_FAIL": + print("failed") +elif status == "OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/profiles/slurm-mahti-test/submit.py b/profiles/slurm-mahti-test/submit.py new file mode 100755 index 000000000..7989e77aa --- /dev/null +++ b/profiles/slurm-mahti-test/submit.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +import os +import re +import subprocess as sp +import sys + +import yaml +from snakemake.logging import logger +from snakemake.utils import read_job_properties + +cluster_config_file = os.path.join(os.path.dirname(__file__), "config.cluster.yaml") +cluster_config = yaml.load(open(cluster_config_file), Loader=yaml.FullLoader) + +jobscript = sys.argv[-1] +job_properties = read_job_properties(jobscript) + +options = [] + +if job_properties["type"] == "single": + name = job_properties["rule"] +elif job_properties["type"] == "group": + name = job_properties["groupid"] +else: + raise NotImplementedError( + f"Don't know what to do with job_properties['type']=={job_properties['type']}" + ) + +options += ["--job-name", name] + +partition = cluster_config["cpu-partition"] +account = cluster_config["cpu-account"] + +if "resources" in job_properties: + resources = job_properties["resources"] + + if "gpu" in resources and int(resources["gpu"]) >= 1: + num_gpu = str(resources["gpu"]) + options += [f"--gres=gpu:a100:{num_gpu}"] + account = cluster_config["gpu-account"] + + if num_gpu == "1": + partition = cluster_config["single-gpu-partition"] + else: + partition = cluster_config["multi-gpu-partition"] + + # we don't need explicit memory limiting for now + if "mem_mb" in resources: + memory = str(resources["mem_mb"]) + options += [f"--mem={memory}"] + +cuda_dir = os.getenv("CUDA_DIR") +if cuda_dir: + options += ["--export", f'ALL,SINGULARITY_BIND="{cuda_dir}"'] + +options += ["-p", partition] +options += ["-A", account] +options += ["--nodes=1"] +options += ["-t", str(cluster_config["time-limit"])] + +if "threads" in job_properties: + options += ["--cpus-per-task", str(job_properties["threads"])] + +try: + # cmd = ["sbatch"] + ["--parsable"] + options + [f"--wrap=\"/bin/bash -c '{jobscript}'\""] + cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] + logger.info(f"Running command: {cmd}") + res = sp.check_output(cmd) +except sp.CalledProcessError as e: + raise e +# Get jobid +res = res.decode() +try: + jobid = re.search(r"(\d+)", res).group(1) +except Exception as e: + raise e + +print(jobid) diff --git a/profiles/slurm-mahti/config.cluster.yaml b/profiles/slurm-mahti/config.cluster.yaml new file mode 100755 index 000000000..de972ce83 --- /dev/null +++ b/profiles/slurm-mahti/config.cluster.yaml @@ -0,0 +1,7 @@ +# CSC Mahti +single-gpu-partition: gpusmall +multi-gpu-partition: gpumedium +cpu-partition: medium +cpu-account: project_2007095 +gpu-account: project_2007095 +time-limit: "00:72:00" diff --git a/profiles/slurm-mahti/config.yaml b/profiles/slurm-mahti/config.yaml new file mode 100755 index 000000000..8a7ce086a --- /dev/null +++ b/profiles/slurm-mahti/config.yaml @@ -0,0 +1,33 @@ +cluster: "submit.py" +cluster-status: "status.py" +jobscript: "jobscript.sh" +jobs: 8 +restart-times: 0 +immediate-submit: false +verbose: false +max-jobs-per-second: 1 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 60 +rerun-incomplete: true # recomended for cluster submissions +keep-going: false +default-resources: ["mem_mb=64000"] +use-singularity: true +use-conda: true +resources: gpu=1 +cores: 8 +cache: false +reason: true +# if CPU nodes don't have access to cuda dirs, use +# export CUDA_DIR=$(CUDA_DIR); $(SNAKEMAKE) ... +# singularity-args: "--bind $(SHARED_ROOT),/tmp --nv --containall" +singularity-args: "--bind /scratch/project_2007095/tommi/data,$CUDA_INSTALL_ROOT,$CUDNN_INSTALL_ROOT:/cudnn,/scratch/project_2007095/tommi/data/temp:/tmp --nv --containall" +config: + - deps=false + - root=/scratch/project_2007095/tommi/data + - cuda=/appl/spack/v017/install-tree/gcc-11.2.0/cuda-11.5.0-mg4ztb/ + - cudnn=/appl/spack/v017/install-tree/gcc-11.2.0/cudnn-8.3.3.40-11.5-crjjbv/ + - workspace=10000 + - numgpus=1 + - mariancmake="" + - gpus="0" diff --git a/profiles/slurm-mahti/jobscript.sh b/profiles/slurm-mahti/jobscript.sh new file mode 100755 index 000000000..256fbc94b --- /dev/null +++ b/profiles/slurm-mahti/jobscript.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# properties = {properties} + +#parse properties json and get log file name +log_file=$(echo '{properties}' | jq -r .log[0]) + +export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES + +if command -v nvidia-smi &> /dev/null +then + nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,power.draw,memory.total,memory.free,memory.used --format=csv -l 1 > $log_file.gpu & +fi + +{exec_job} + +if command -v nvidia-smi &> /dev/null +then + kill %1 + gzip $log_file.gpu +fi diff --git a/profiles/slurm-mahti/status.py b/profiles/slurm-mahti/status.py new file mode 100755 index 000000000..847a9393b --- /dev/null +++ b/profiles/slurm-mahti/status.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import logging +import re +import shlex +import subprocess as sp +import sys +import time + +logger = logging.getLogger("__name__") + +STATUS_ATTEMPTS = 20 + +jobid = sys.argv[1] + +for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n")) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + logger.error(e) + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}")) + m = re.search(r"JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + sys.exit(0) + else: + time.sleep(1) + +status = res[jobid] + +if status == "BOOT_FAIL": + print("failed") +elif status == "OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/profiles/slurm-mahti/submit.py b/profiles/slurm-mahti/submit.py new file mode 100755 index 000000000..7989e77aa --- /dev/null +++ b/profiles/slurm-mahti/submit.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +import os +import re +import subprocess as sp +import sys + +import yaml +from snakemake.logging import logger +from snakemake.utils import read_job_properties + +cluster_config_file = os.path.join(os.path.dirname(__file__), "config.cluster.yaml") +cluster_config = yaml.load(open(cluster_config_file), Loader=yaml.FullLoader) + +jobscript = sys.argv[-1] +job_properties = read_job_properties(jobscript) + +options = [] + +if job_properties["type"] == "single": + name = job_properties["rule"] +elif job_properties["type"] == "group": + name = job_properties["groupid"] +else: + raise NotImplementedError( + f"Don't know what to do with job_properties['type']=={job_properties['type']}" + ) + +options += ["--job-name", name] + +partition = cluster_config["cpu-partition"] +account = cluster_config["cpu-account"] + +if "resources" in job_properties: + resources = job_properties["resources"] + + if "gpu" in resources and int(resources["gpu"]) >= 1: + num_gpu = str(resources["gpu"]) + options += [f"--gres=gpu:a100:{num_gpu}"] + account = cluster_config["gpu-account"] + + if num_gpu == "1": + partition = cluster_config["single-gpu-partition"] + else: + partition = cluster_config["multi-gpu-partition"] + + # we don't need explicit memory limiting for now + if "mem_mb" in resources: + memory = str(resources["mem_mb"]) + options += [f"--mem={memory}"] + +cuda_dir = os.getenv("CUDA_DIR") +if cuda_dir: + options += ["--export", f'ALL,SINGULARITY_BIND="{cuda_dir}"'] + +options += ["-p", partition] +options += ["-A", account] +options += ["--nodes=1"] +options += ["-t", str(cluster_config["time-limit"])] + +if "threads" in job_properties: + options += ["--cpus-per-task", str(job_properties["threads"])] + +try: + # cmd = ["sbatch"] + ["--parsable"] + options + [f"--wrap=\"/bin/bash -c '{jobscript}'\""] + cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] + logger.info(f"Running command: {cmd}") + res = sp.check_output(cmd) +except sp.CalledProcessError as e: + raise e +# Get jobid +res = res.decode() +try: + jobid = re.search(r"(\d+)", res).group(1) +except Exception as e: + raise e + +print(jobid) diff --git a/profiles/slurm-moz/config.yaml b/profiles/slurm-moz/config.yaml index 85d3928d1..18ff4ffed 100755 --- a/profiles/slurm-moz/config.yaml +++ b/profiles/slurm-moz/config.yaml @@ -15,6 +15,7 @@ default-resources: "mem_mb=5980" use-singularity: true use-conda: true resources: gpu=8 +cluster-cancel: scancel cores: 48 cache: false reason: true @@ -30,4 +31,4 @@ config: - workspace=8000 - numgpus=8 - mariancmake="" - - gpus="" \ No newline at end of file + - gpus="" diff --git a/profiles/slurm-puhti-test/.config.cluster.yaml.swp b/profiles/slurm-puhti-test/.config.cluster.yaml.swp new file mode 100644 index 000000000..c0ec335cd Binary files /dev/null and b/profiles/slurm-puhti-test/.config.cluster.yaml.swp differ diff --git a/profiles/slurm-puhti-test/config.cluster.yaml b/profiles/slurm-puhti-test/config.cluster.yaml new file mode 100755 index 000000000..15422298d --- /dev/null +++ b/profiles/slurm-puhti-test/config.cluster.yaml @@ -0,0 +1,7 @@ +# CSC Puhti +single-gpu-partition: gputest +multi-gpu-partition: gputest +cpu-partition: test +cpu-account: project_2006944 +gpu-account: project_2006944 +time-limit: "00:15:00" diff --git a/profiles/slurm-puhti-test/config.yaml b/profiles/slurm-puhti-test/config.yaml new file mode 100755 index 000000000..8ad26441c --- /dev/null +++ b/profiles/slurm-puhti-test/config.yaml @@ -0,0 +1,31 @@ +cluster: "submit.py" +cluster-status: "status.py" +jobscript: "jobscript.sh" +jobs: 1 +restart-times: 0 +immediate-submit: false +verbose: false +max-jobs-per-second: 1 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 60 +rerun-incomplete: true # recomended for cluster submissions +keep-going: false +default-resources: "mem_mb=64000" +use-singularity: true +use-conda: true +resources: gpu=1 +cores: 8 +cache: false +reason: true +singularity-args: "--bind $PWD/../data,$CUDA_INSTALL_ROOT,$CUDNN_INSTALL_ROOT,$PWD/../data/tmp:/tmp --nv --containall" +config: + - deps=false + # These config values are now set in Snakefile, default values can be overwritten here + #- root=/scratch/project_2006944/multiteacher_pipeline/data + #- cuda=/appl/spack/v018/install-tree/gcc-9.4.0/cuda-11.1.1-lfaa3j + #- cudnn=/appl/spack/v018/install-tree/gcc-9.4.0/cudnn-8.0.5.39-11.1-5cnrmz + - workspace=10000 + - numgpus=1 + - mariancmake="" + - gpus="" diff --git a/profiles/slurm-puhti-test/jobscript.sh b/profiles/slurm-puhti-test/jobscript.sh new file mode 100755 index 000000000..b85147b1f --- /dev/null +++ b/profiles/slurm-puhti-test/jobscript.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set +x +# properties = {properties} + +#parse properties json and get log file name +log_file=$(echo '{properties}' | jq -r .log[0]) +gpu=$(echo '{properties}' | jq -r .resources.gpu) + +export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES + +if [ $gpu != "null" ] && [ $gpu != "0" ]; then + nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,power.draw,memory.total,memory.free,memory.used --format=csv -l 10 > $log_file.gpu & + nvidiasmi_pid=$! + #/appl/soft/ai/bin/gpu-energy + #monitor_pid=$! +fi + +{exec_job} + +if [ -z $nvidiasmi_pid ]; then + kill $nvidiasmi_pid +fi +if [ -z $monitor_pid ]; then + kill -SIGUSR1 $monitor_pid +fi diff --git a/profiles/slurm-puhti-test/status.py b/profiles/slurm-puhti-test/status.py new file mode 100755 index 000000000..847a9393b --- /dev/null +++ b/profiles/slurm-puhti-test/status.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import logging +import re +import shlex +import subprocess as sp +import sys +import time + +logger = logging.getLogger("__name__") + +STATUS_ATTEMPTS = 20 + +jobid = sys.argv[1] + +for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n")) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + logger.error(e) + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}")) + m = re.search(r"JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + sys.exit(0) + else: + time.sleep(1) + +status = res[jobid] + +if status == "BOOT_FAIL": + print("failed") +elif status == "OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/profiles/slurm-puhti-test/submit.py b/profiles/slurm-puhti-test/submit.py new file mode 100755 index 000000000..e727bc429 --- /dev/null +++ b/profiles/slurm-puhti-test/submit.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +import os +import re +import subprocess as sp +import sys + +import yaml +from snakemake.logging import logger +from snakemake.utils import read_job_properties + +cluster_config_file = os.path.join(os.path.dirname(__file__), "config.cluster.yaml") +cluster_config = yaml.load(open(cluster_config_file), Loader=yaml.FullLoader) + +jobscript = sys.argv[-1] +job_properties = read_job_properties(jobscript) + +options = [] + +if job_properties["type"] == "single": + name = job_properties["rule"] +elif job_properties["type"] == "group": + name = job_properties["groupid"] +else: + raise NotImplementedError( + f"Don't know what to do with job_properties['type']=={job_properties['type']}" + ) + +options += ["--job-name", name] + +partition = cluster_config["cpu-partition"] +account = cluster_config["cpu-account"] + +if "resources" in job_properties: + resources = job_properties["resources"] + + if "gpu" in resources and int(resources["gpu"]) >= 1: + num_gpu = str(resources["gpu"]) + options += [f"--gres=gpu:v100:{num_gpu}"] + account = cluster_config["gpu-account"] + + if num_gpu == "1": + partition = cluster_config["single-gpu-partition"] + else: + partition = cluster_config["multi-gpu-partition"] + + # we don't need explicit memory limiting for now + if "mem_mb" in resources: + memory = str(resources["mem_mb"]) + options += [f"--mem={memory}"] + +cuda_dir = os.getenv("CUDA_DIR") +if cuda_dir: + options += ["--export", f'ALL,SINGULARITY_BIND="{cuda_dir}"'] + +options += ["-p", partition] +options += ["-A", account] +options += ["--nodes=1"] +options += ["-t", str(cluster_config["time-limit"])] + +if "threads" in job_properties: + options += ["--cpus-per-task", str(job_properties["threads"])] + +try: + # cmd = ["sbatch"] + ["--parsable"] + options + [f"--wrap=\"/bin/bash -c '{jobscript}'\""] + cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] + logger.info(f"Running command: {cmd}") + res = sp.check_output(cmd) +except sp.CalledProcessError as e: + raise e +# Get jobid +res = res.decode() +try: + jobid = re.search(r"(\d+)", res).group(1) +except Exception as e: + raise e + +print(jobid) diff --git a/profiles/slurm-puhti/config.cluster.yaml b/profiles/slurm-puhti/config.cluster.yaml new file mode 100755 index 000000000..8d07aacb0 --- /dev/null +++ b/profiles/slurm-puhti/config.cluster.yaml @@ -0,0 +1,7 @@ +# CSC Puhti +single-gpu-partition: gpu +multi-gpu-partition: gpu +cpu-partition: small +cpu-account: project_2006944 +gpu-account: project_2006944 +time-limit: "06:00:00" diff --git a/profiles/slurm-puhti/config.yaml b/profiles/slurm-puhti/config.yaml new file mode 100755 index 000000000..02f218ff4 --- /dev/null +++ b/profiles/slurm-puhti/config.yaml @@ -0,0 +1,31 @@ +cluster: "submit.py" +cluster-status: "status.py" +jobscript: "jobscript.sh" +jobs: 8 +restart-times: 0 +immediate-submit: false +verbose: false +max-jobs-per-second: 1 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 60 +rerun-incomplete: true # recomended for cluster submissions +keep-going: false +default-resources: "mem_mb=64000" +use-singularity: true +use-conda: true +resources: gpu=1 +cores: 8 +cache: false +reason: true +singularity-args: "--bind $PWD/../data,$CUDA_INSTALL_ROOT,$CUDNN_INSTALL_ROOT,$PWD/../data/tmp:/tmp --nv --containall" +config: + - deps=false + # These config values are now set in Snakefile, default values can be overwritten here + #- root=/scratch/project_2006944/multiteacher_pipeline/data + #- cuda=/appl/spack/v018/install-tree/gcc-9.4.0/cuda-11.1.1-lfaa3j + #- cudnn=/appl/spack/v018/install-tree/gcc-9.4.0/cudnn-8.0.5.39-11.1-5cnrmz + - workspace=10000 + - numgpus=1 + - mariancmake="" + - gpus="" diff --git a/profiles/slurm-puhti/jobscript.sh b/profiles/slurm-puhti/jobscript.sh new file mode 100755 index 000000000..b85147b1f --- /dev/null +++ b/profiles/slurm-puhti/jobscript.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set +x +# properties = {properties} + +#parse properties json and get log file name +log_file=$(echo '{properties}' | jq -r .log[0]) +gpu=$(echo '{properties}' | jq -r .resources.gpu) + +export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES + +if [ $gpu != "null" ] && [ $gpu != "0" ]; then + nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,power.draw,memory.total,memory.free,memory.used --format=csv -l 10 > $log_file.gpu & + nvidiasmi_pid=$! + #/appl/soft/ai/bin/gpu-energy + #monitor_pid=$! +fi + +{exec_job} + +if [ -z $nvidiasmi_pid ]; then + kill $nvidiasmi_pid +fi +if [ -z $monitor_pid ]; then + kill -SIGUSR1 $monitor_pid +fi diff --git a/profiles/slurm-puhti/status.py b/profiles/slurm-puhti/status.py new file mode 100755 index 000000000..847a9393b --- /dev/null +++ b/profiles/slurm-puhti/status.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import logging +import re +import shlex +import subprocess as sp +import sys +import time + +logger = logging.getLogger("__name__") + +STATUS_ATTEMPTS = 20 + +jobid = sys.argv[1] + +for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n")) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + logger.error(e) + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}")) + m = re.search(r"JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + sys.exit(0) + else: + time.sleep(1) + +status = res[jobid] + +if status == "BOOT_FAIL": + print("failed") +elif status == "OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/profiles/slurm-puhti/submit.py b/profiles/slurm-puhti/submit.py new file mode 100755 index 000000000..e727bc429 --- /dev/null +++ b/profiles/slurm-puhti/submit.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +import os +import re +import subprocess as sp +import sys + +import yaml +from snakemake.logging import logger +from snakemake.utils import read_job_properties + +cluster_config_file = os.path.join(os.path.dirname(__file__), "config.cluster.yaml") +cluster_config = yaml.load(open(cluster_config_file), Loader=yaml.FullLoader) + +jobscript = sys.argv[-1] +job_properties = read_job_properties(jobscript) + +options = [] + +if job_properties["type"] == "single": + name = job_properties["rule"] +elif job_properties["type"] == "group": + name = job_properties["groupid"] +else: + raise NotImplementedError( + f"Don't know what to do with job_properties['type']=={job_properties['type']}" + ) + +options += ["--job-name", name] + +partition = cluster_config["cpu-partition"] +account = cluster_config["cpu-account"] + +if "resources" in job_properties: + resources = job_properties["resources"] + + if "gpu" in resources and int(resources["gpu"]) >= 1: + num_gpu = str(resources["gpu"]) + options += [f"--gres=gpu:v100:{num_gpu}"] + account = cluster_config["gpu-account"] + + if num_gpu == "1": + partition = cluster_config["single-gpu-partition"] + else: + partition = cluster_config["multi-gpu-partition"] + + # we don't need explicit memory limiting for now + if "mem_mb" in resources: + memory = str(resources["mem_mb"]) + options += [f"--mem={memory}"] + +cuda_dir = os.getenv("CUDA_DIR") +if cuda_dir: + options += ["--export", f'ALL,SINGULARITY_BIND="{cuda_dir}"'] + +options += ["-p", partition] +options += ["-A", account] +options += ["--nodes=1"] +options += ["-t", str(cluster_config["time-limit"])] + +if "threads" in job_properties: + options += ["--cpus-per-task", str(job_properties["threads"])] + +try: + # cmd = ["sbatch"] + ["--parsable"] + options + [f"--wrap=\"/bin/bash -c '{jobscript}'\""] + cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] + logger.info(f"Running command: {cmd}") + res = sp.check_output(cmd) +except sp.CalledProcessError as e: + raise e +# Get jobid +res = res.decode() +try: + jobid = re.search(r"(\d+)", res).group(1) +except Exception as e: + raise e + +print(jobid) diff --git a/taskcluster/ci/merge-corpus/kind.yml b/taskcluster/ci/merge-corpus/kind.yml index d04e968ec..ac193d34a 100644 --- a/taskcluster/ci/merge-corpus/kind.yml +++ b/taskcluster/ci/merge-corpus/kind.yml @@ -38,6 +38,7 @@ task-defaults: from-parameters: src_locale: training_config.experiment.src trg_locale: training_config.experiment.trg + parallel_max_sentences: training_config.experiment.parallel-max-sentences substitution-fields: - name - label @@ -73,13 +74,11 @@ task-defaults: command: - bash - -c - # Arguments are: - # 1) output prefix - # 2) input files - >- export BIN=$MOZ_FETCHES_DIR && $VCS_PATH/pipeline/clean/merge-corpus.sh artifacts/{artifact_prefix} + {parallel_max_sentences} $MOZ_FETCHES_DIR/*.zst fetches: toolchain: diff --git a/taskcluster/ci/merge-devset/kind.yml b/taskcluster/ci/merge-devset/kind.yml index 0ab56224f..3975bdf52 100644 --- a/taskcluster/ci/merge-devset/kind.yml +++ b/taskcluster/ci/merge-devset/kind.yml @@ -73,13 +73,11 @@ task-defaults: command: - bash - -c - # Arguments are: - # 1) output prefix - # 2) input files - >- export BIN=$MOZ_FETCHES_DIR && $VCS_PATH/pipeline/clean/merge-corpus.sh artifacts/{artifact_prefix} + inf $MOZ_FETCHES_DIR/*.zst fetches: toolchain: diff --git a/taskcluster/ci/score/kind.yml b/taskcluster/ci/score/kind.yml index d31bb2eb7..952940850 100644 --- a/taskcluster/ci/score/kind.yml +++ b/taskcluster/ci/score/kind.yml @@ -77,7 +77,8 @@ tasks: $VCS_PATH/pipeline/cefilter/score.sh fetches/final.model.npz.best-{best_model}.npz fetches/vocab.spm - fetches/corpus + fetches/corpus.{src_locale}.zst + fetches/corpus.{trg_locale}.zst artifacts/scores.txt dependencies: diff --git a/taskcluster/ci/tests/kind.yml b/taskcluster/ci/tests/kind.yml index 23e0d3fc3..04f774c0e 100644 --- a/taskcluster/ci/tests/kind.yml +++ b/taskcluster/ci/tests/kind.yml @@ -41,9 +41,14 @@ tasks: make snakemake && make git-modules && + echo "Start the test dry run" && + make test-dry-run && echo "Start the dry run" && make dry-run && - make test-dry-run + echo "Start the opusmt test dry run" && + CONFIG=configs/config.opusmt-test.yml make dry-run && + echo "Start the opusmt dry run" && + CONFIG=configs/config.opusmt.yml make dry-run black: # Run python's black formatter, which formats python files. diff --git a/taskcluster/ci/toolchain/kind.yml b/taskcluster/ci/toolchain/kind.yml index bfe8bdfd1..94f8f34c9 100644 --- a/taskcluster/ci/toolchain/kind.yml +++ b/taskcluster/ci/toolchain/kind.yml @@ -40,7 +40,7 @@ tasks: script: build-marian.sh resources: - taskcluster/scripts/toolchain/build-marian.sh - - pipeline/setup/compile-marian.sh + - pipeline/setup/compile-marian-dev.sh toolchain-artifact: public/build/marian.tar.zst fetches: fetch: @@ -58,7 +58,7 @@ tasks: resources: - taskcluster/scripts/toolchain/build-marian.sh - taskcluster/scripts/toolchain/browsermt.patch - - pipeline/setup/compile-marian.sh + - pipeline/setup/compile-marian-dev.sh toolchain-artifact: public/build/marian.tar.zst fetches: fetch: diff --git a/taskcluster/scripts/toolchain/build-marian.sh b/taskcluster/scripts/toolchain/build-marian.sh index 367fcfe3a..cd5db73c3 100755 --- a/taskcluster/scripts/toolchain/build-marian.sh +++ b/taskcluster/scripts/toolchain/build-marian.sh @@ -16,7 +16,7 @@ if [ "$patch" != "none" ]; then fi # TODO: consider not calling out to this since it's such a simple script... -bash $VCS_PATH/pipeline/setup/compile-marian.sh "${MARIAN_DIR}/build" "$(nproc)" +bash $VCS_PATH/pipeline/setup/compile-marian-dev.sh "${MARIAN_DIR}/build" "$(nproc)" cd $MARIAN_DIR/build tar -cf $UPLOAD_DIR/marian.tar \ diff --git a/taskcluster/translations_taskgraph/parameters.py b/taskcluster/translations_taskgraph/parameters.py index 02488a756..4a9335f4b 100644 --- a/taskcluster/translations_taskgraph/parameters.py +++ b/taskcluster/translations_taskgraph/parameters.py @@ -137,6 +137,7 @@ def get_defaults(_): Required("vocab"): str, Required("mono-max-sentences-trg"): int, Required("mono-max-sentences-src"): int, + Optional("parallel-max-sentences"): int, Required("split-length"): int, Required("spm-sample-size"): int, Optional("spm-vocab-size"): int,