diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 3cab0156..7c0caca0 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -438,7 +438,8 @@ def __create_auxiliary_ds(rec): def _create_phase10_ds( - generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]] + generated_dataset: Dataset, + auxiliary_inst: Optional[Dict[str, List[str]]], ): """ Create a dataset for Phase 1.0 of downstream training. @@ -450,13 +451,20 @@ def _create_phase10_ds( knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=True ) - knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + # Include phase07 + pretraining_knowledge_ds = _generate_knowledge_qa_dataset( + generated_dataset, keep_context_separate=False + ).map(_conv_pretrain) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) + if auxiliary_dataset is not None: - phase10 = concatenate_datasets([knowledge_ds, auxiliary_dataset]) + phase10 = concatenate_datasets( + [raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset] + ) else: - phase10 = knowledge_ds + phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds]) return phase10