Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gabriel opusmt2 #202

Draft
wants to merge 16 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rd_party/marian-dev
Submodule marian-dev updated 150 files
156 changes: 156 additions & 0 deletions Ftt.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# Container definition for the Snakemake-containerized pipeline image.
# Bootstraps from the condaforge/mambaforge Docker image, copies the dependency
# installer script plus one environment.yaml per conda environment into the
# image (each under /conda-envs/<hash>/), and records the Snakemake
# containerization labels.
Bootstrap: docker
From: condaforge/mambaforge:22.11.1-4
Stage: spython-base

%files
pipeline/setup/install-deps.sh install-deps.sh
envs/base.yml /conda-envs/c7aefb385f47824bd83e494ba6175afb/environment.yaml
envs/bicleaner-ai-lumi.yml /conda-envs/6dc32b6f0731acf817ada219622a98b8/environment.yaml
envs/bicleaner-ai.yml /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/environment.yaml
envs/bicleaner.yml /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/environment.yaml
envs/corpus.yml /conda-envs/2e8e4401e9abbca04941f823d00fe74a/environment.yaml
envs/tensorboard.yml /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221/environment.yaml
%labels
io.github.snakemake.containerized="true"
io.github.snakemake.conda_env_hash="41592307ee99833c1ad2068c1e915ff9c38acc418b5bebfe7e107d9a79980cb4"
%post

# Point apt at the mirrors.nic.funet.fi mirror instead of archive.ubuntu.com.
# Remove this if not in Finland, or change it to a closer mirror.
sed -i "s/archive.ubuntu.com/mirrors.nic.funet.fi/g" /etc/apt/sources.list

# Export this before the first package install: it only helps if it is already
# set when apt-get runs, so that no package configuration prompt can block the
# unattended build.
export DEBIAN_FRONTEND=noninteractive

# Compiler toolchain and curl, installed ahead of the project's own dependency
# installer.
apt-get update && apt-get -y install gcc g++ curl

# Project dependency installer, copied into the image by the %files section.
bash install-deps.sh

# Step 1: Retrieve conda environments
#
# For every environment the block below records (as comments) the source file,
# the install prefix and the environment specification, then makes sure the
# prefix directory exists. The matching environment.yaml is copied into each
# prefix by the %files section.

# Conda environment:
#   source: envs/base.yml
#   prefix: /conda-envs/c7aefb385f47824bd83e494ba6175afb
#   name: bergamot-training
#   channels:
#     - conda-forge
#     - defaults
#   dependencies:
#     - python=3.9
#     - cmake=3.21.1
#     - pip=21.2.2
#     - pip:
#       - sacrebleu==2.0.0
#       - mtdata==0.4.0
#       - fasttext==0.9.2
#       - regex==2019.8.19
#       - sacremoses==0.0.43
mkdir -p /conda-envs/c7aefb385f47824bd83e494ba6175afb

# Conda environment:
#   source: envs/bicleaner-ai-lumi.yml
#   prefix: /conda-envs/6dc32b6f0731acf817ada219622a98b8
#   name: bicleaner-ai
#   channels:
#     - conda-forge
#     - defaults
#   dependencies:
#     - python=3.9
#     - pip==21.2.2
#     - cmake=3.21.1
#     - pip:
#       - bicleaner-ai==2.2.1
#       - tensorflow-rocm==2.10.0.520
mkdir -p /conda-envs/6dc32b6f0731acf817ada219622a98b8

# Conda environment:
#   source: envs/bicleaner-ai.yml
#   prefix: /conda-envs/04b8248cb528961ad452c73dd0a7c8b6
#   name: bicleaner-ai
#   channels:
#     - conda-forge
#     - defaults
#   dependencies:
#     - python=3.9
#     - pip==21.2.2
#     - cmake=3.21.1
#     - pip:
#       - bicleaner-ai==2.2.1
#       - tensorflow==2.6.5
mkdir -p /conda-envs/04b8248cb528961ad452c73dd0a7c8b6

# Conda environment:
#   source: envs/bicleaner.yml
#   prefix: /conda-envs/a4f700aa6ff0256dcd9321b536a081ac
#   name: bicleaner
#   channels:
#     - conda-forge
#     - bitextor
#     - defaults
#   dependencies:
#     - python=3.8
#     - pip==23.0
#     - cmake=3.21.1
#     - hunspell==1.7.0
#     - pip:
#       - pypi-kenlm
#       - bicleaner==0.16.1
mkdir -p /conda-envs/a4f700aa6ff0256dcd9321b536a081ac

# Conda environment:
#   source: envs/corpus.yml
#   prefix: /conda-envs/2e8e4401e9abbca04941f823d00fe74a
#   name: corpus
#   channels:
#     - conda-forge
#     - defaults
#   dependencies:
#     - python=3.9
#     - pip=21.2.2
#     - pip:
#       - sacrebleu==2.0.0
#       - mtdata==0.3.2
#       - requests==2.26.0
mkdir -p /conda-envs/2e8e4401e9abbca04941f823d00fe74a

# Conda environment:
#   source: envs/tensorboard.yml
#   prefix: /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221
#   name: tensorboard
#   channels:
#     - conda-forge
#     - defaults
#   dependencies:
#     - python=3.9
#     - cmake=3.21.1
#     - pip=21.2.2
#     - pip:
#       - tensorboard==2.5.0
#       - tensorboardX==2.2
#       - click==8.0.1
#       - toolz==0.11.1
mkdir -p /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221

# Step 2: Generate conda environments
#
# Each environment is created in place, from the environment.yaml that already
# sits inside its own prefix directory. Creation order: base,
# bicleaner-ai-lumi, bicleaner-ai, bicleaner, corpus, tensorboard.
# The commands are chained with &&, so the first failing create skips every
# remaining create as well as the final package-cache cleanup.

mamba env create --prefix /conda-envs/c7aefb385f47824bd83e494ba6175afb --file /conda-envs/c7aefb385f47824bd83e494ba6175afb/environment.yaml && \
mamba env create --prefix /conda-envs/6dc32b6f0731acf817ada219622a98b8 --file /conda-envs/6dc32b6f0731acf817ada219622a98b8/environment.yaml && \
mamba env create --prefix /conda-envs/04b8248cb528961ad452c73dd0a7c8b6 --file /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/environment.yaml && \
mamba env create --prefix /conda-envs/a4f700aa6ff0256dcd9321b536a081ac --file /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/environment.yaml && \
mamba env create --prefix /conda-envs/2e8e4401e9abbca04941f823d00fe74a --file /conda-envs/2e8e4401e9abbca04941f823d00fe74a/environment.yaml && \
mamba env create --prefix /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221 --file /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221/environment.yaml && \
mamba clean --all -y

# Bicleaner needs the fasttext language-id model inside the fastspell package of
# each bicleaner environment. Download it once, copy it into the three
# environments (bicleaner-ai-lumi, bicleaner, bicleaner-ai), then delete the
# download so the extra copy is not baked into the image.
wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
cp lid.176.bin /conda-envs/6dc32b6f0731acf817ada219622a98b8/lib/python3.9/site-packages/fastspell/lid.176.bin
cp lid.176.bin /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/lib/python3.8/site-packages/fastspell/lid.176.bin
cp lid.176.bin /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/lib/python3.9/site-packages/fastspell/lid.176.bin
rm -f lid.176.bin

# Fastspell (used by bicleaner) relies on hunspell to tell similar languages
# apart, so install the full set of hunspell dictionaries it ships. The archive
# is removed after extraction for the same reason as the model above.
wget -O fastspell_dictionaries.tgz https://github.com/mbanon/fastspell/releases/download/dictionaries_v1/fastspell_dictionaries.tgz
mkdir -p /usr/share/hunspell
tar -xf fastspell_dictionaries.tgz --directory /usr/share/hunspell
rm -f fastspell_dictionaries.tgz

# Entry points: both scriptlets below replace the container process with bash
# (exec) and forward all arguments to it unchanged.
%runscript
exec /bin/bash "$@"
%startscript
exec /bin/bash "$@"
14 changes: 14 additions & 0 deletions InstallSnakemakeEnvs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from os import listdir
def get_envs(wildcards):
    """Return one ``<name>.done`` marker path per ``envs/<name>.yml`` file.

    Written as a Snakemake input function, hence the ``wildcards`` parameter;
    the value is not used.

    Only the trailing ``.yml`` extension is swapped for ``.done`` (an interior
    ".yml" in a file name is left alone), and the result is sorted so it does
    not depend on the directory listing order.
    """
    suffix = ".yml"
    # `listdir` is the name imported at the top of the file (`from os import
    # listdir`); the module name `os` itself is never imported there.
    return sorted(
        name[: -len(suffix)] + ".done"
        for name in listdir("envs")
        if name.endswith(suffix)
    )

# Workflow-level container image. Ftt.sif is the image that the Makefile's
# `containerize` target builds from Ftt.def.
container: 'Ftt.sif'

# Default target: ask for the marker file of every environment that get_envs
# finds under envs/.
rule all:
    input:
        get_envs

# Produce the marker file '<env>.done' with envs/<env>.yml declared as the
# job's conda environment. The shell command is a plain string: the previous
# f-string with doubled braces evaluated to exactly this text.
rule make_envs:
    conda:
        'envs/{env}.yml'
    output:
        '{env}.done'
    shell:
        'touch {output}'

44 changes: 41 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ PROFILE?=local
# execution rule or path to rule output, default is all
TARGET=
REPORTS?=../reports
EXTRA=
# for tensorboard
MODELS?=../models

Expand All @@ -30,9 +31,24 @@ conda:

# Create the `snakemake` conda environment and the Snakemake output cache
# directory.
# Exactly one pinned Snakemake release is installed. The environment name is
# fixed (-n snakemake), so a second `mamba create` with a different pin would
# target the very same environment; 7.19.1 is the version kept.
# NOTE(review): `$(CONDA_ACTIVATE) base` only affects the following lines if the
# recipe runs in a single shell (.ONESHELL) — confirm against the file header.
.PHONY: snakemake
snakemake:
	$(CONDA_ACTIVATE) base
	mamba create -c conda-forge -c bioconda -n snakemake snakemake==7.19.1 tabulate==0.8.10 --yes
	mkdir -p "$(SNAKEMAKE_OUTPUT_CACHE)"


# Build the pipeline container image Ftt.sif:
#   1. let Snakemake write a Dockerfile for the workflow (--containerize),
#   2. convert that Dockerfile into the definition file Ftt.def with spython,
#   3. patch Ftt.def so the image also copies and runs
#      pipeline/setup/install-deps.sh,
#   4. build Ftt.sif from Ftt.def with apptainer.
# The text injected after %post exports DEBIAN_FRONTEND=noninteractive before
# apt-get installs anything (the variable has no effect on an install that has
# already run), and installs curl next to gcc/g++ so that a regenerated
# Ftt.def matches the package list of the committed one.
# The apt mirror rewrite points at mirrors.nic.funet.fi; change or drop it when
# building outside Finland.
# NOTE(review): `$(CONDA_ACTIVATE) snakemake` only affects the following lines
# if the recipe runs in a single shell (.ONESHELL) — confirm against the file
# header.
.PHONY: containerize
containerize:
	$(CONDA_ACTIVATE) snakemake
	$(SNAKEMAKE) \
		--profile=profiles/$(PROFILE) \
		--configfile $(CONFIG) \
		--containerize > Dockerfile
	spython recipe Dockerfile Ftt.def
	sed -i "s|%files|%files\npipeline/setup/install-deps.sh install-deps.sh|" Ftt.def
	sed -i 's#%post#%post\ncat /etc/apt/sources.list | sed "s/archive.ubuntu.com/mirrors.nic.funet.fi/g" > temp \&\& mv temp /etc/apt/sources.list \
	\nexport DEBIAN_FRONTEND=noninteractive \
	\napt-get update \&\& apt-get -y install gcc g++ curl \
	\nbash install-deps.sh#' Ftt.def
	apptainer build Ftt.sif Ftt.def

# build container image for cluster and run-local modes (preferred)
build:
sudo singularity build Singularity.sif Singularity.def
Expand All @@ -53,7 +69,18 @@ dry-run:
--profile=profiles/$(PROFILE) \
--configfile $(CONFIG) \
-n \
$(TARGET)
$(TARGET) \
$(EXTRA) \

# Dry run (-n) with an explicit conda base path passed to Snakemake.
# The path defaults to ../bin and can be overridden per invocation:
#   make dry-run-hpc CONDA_BASE_PATH=/path/to/conda
# TARGET selects the rule or output to plan; EXTRA is appended verbatim as
# additional Snakemake arguments.
.PHONY: dry-run-hpc
dry-run-hpc:
	echo "Dry run with config $(CONFIG) and profile $(PROFILE)"
	$(SNAKEMAKE) \
		--profile=profiles/$(PROFILE) \
		--configfile $(CONFIG) \
		-n \
		--conda-base-path=$(or $(CONDA_BASE_PATH),../bin) \
		$(TARGET) \
		$(EXTRA)

# Dry run against the test configuration: CONFIG is overridden for this target
# and for its prerequisite `dry-run`, which does the actual work.
test-dry-run: CONFIG=configs/config.test.yml
test-dry-run: dry-run
Expand All @@ -67,7 +94,18 @@ run:
$(SNAKEMAKE) \
--profile=profiles/$(PROFILE) \
--configfile $(CONFIG) \
$(TARGET)
$(TARGET) \
$(EXTRA)

# Run the workflow with an explicit conda base path passed to Snakemake.
# The path defaults to ../bin and can be overridden per invocation:
#   make run-hpc CONDA_BASE_PATH=/path/to/conda
# Every file in the selected profile directory is made executable first.
# TARGET selects the rule or output to build; EXTRA is appended verbatim as
# additional Snakemake arguments.
.PHONY: run-hpc
run-hpc:
	echo "Running with config $(CONFIG) and profile $(PROFILE)"
	chmod +x profiles/$(PROFILE)/*
	$(SNAKEMAKE) \
		--profile=profiles/$(PROFILE) \
		--configfile $(CONFIG) \
		--conda-base-path=$(or $(CONDA_BASE_PATH),../bin) \
		$(TARGET) \
		$(EXTRA)

# Full run against the test configuration: CONFIG is overridden for this target
# and for its prerequisite `run`, which does the actual work.
test: CONFIG=configs/config.test.yml
test: run
Expand Down
Loading