Skip to content

Commit

Permalink
Merge pull request #368 from instructlab/mergify/bp/release-v0.3/pr-366
Browse files (browse the repository at this point in the history)
Data mix fix (backport #366)
  • Loading branch information
bbrowning authored Nov 13, 2024
2 parents f23c952 + 625d9ab commit fbfe7d4
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions src/instructlab/sdg/datamixing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -438,7 +438,8 @@ def __create_auxiliary_ds(rec):


 def _create_phase10_ds(
-    generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]]
+    generated_dataset: Dataset,
+    auxiliary_inst: Optional[Dict[str, List[str]]],
 ):
     """
     Create a dataset for Phase 1.0 of downstream training.
Expand All @@ -450,13 +451,20 @@ def _create_phase10_ds(
     knowledge_ds = _generate_knowledge_qa_dataset(
         generated_dataset, keep_context_separate=True
     )
-    knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4)
+    raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4)
+    # Include phase07
+    pretraining_knowledge_ds = _generate_knowledge_qa_dataset(
+        generated_dataset, keep_context_separate=False
+    ).map(_conv_pretrain)

     auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst)

     if auxiliary_dataset is not None:
-        phase10 = concatenate_datasets([knowledge_ds, auxiliary_dataset])
+        phase10 = concatenate_datasets(
+            [raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset]
+        )
     else:
-        phase10 = knowledge_ds
+        phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds])
     return phase10


Expand Down

0 comments on commit fbfe7d4

Please sign in to comment.