In real-world projects, machine learning engineers often have to solve several specific tasks at once rather than just one. To do this, they typically take a BERT-like model that has been pre-trained on a large volume of data and fine-tune a separate copy of it for each specific task. Each of the resulting models is therefore a single-task model.
However, hosting many single-task models can require a lot of GPU memory and RAM, leading to high costs.
To address this, we implemented Multi-Task Learning (MTL) in the DeepPavlov library. You can find the implementation notebook here.
Just as humans can leverage knowledge from past interactions with various people to improve their ability to make guesses about new companions, multi-task models learn to leverage information from multiple tasks to improve their overall performance and generalization capabilities.
# Install the right version
!pip install deeppavlov==1.1.1
from deeppavlov import build_model, configs
# Build the model from the config named 'multitask_example'.
# NOTE(review): download=True / install=True presumably fetch the pretrained
# files and install the config's requirements -- confirm in the DeepPavlov docs.
model = build_model('multitask_example', download=True, install=True)
# If you wrote your own config from scratch, pass the path to it instead:
# model = build_model('path/to/your/config.json')
# Task names; they must follow the same order as in the config.
tasks = ['cola', 'rte', 'stsb', 'copa', 'conll']

# Build one batch of example inputs per task, keyed by task name.
x = {}
for task in tasks:
    if task == 'rte':
        # Sentence-pair classification/regression: an example can be a tuple.
        x[task] = [
            ('pair 1 phrase 1', 'pair 1 phrase 2'),
            ('pair 2 phrase 1', 'pair 2 phrase 2'),
        ]
    elif task == 'cola':
        # Single-sentence classification/regression: an example can be a string.
        x[task] = ['phrase1']
    elif task == 'conll':
        # NER: examples are strings.
        x[task] = ['first second']
    elif task == 'stsb':
        # Regression. The batch for any task can be empty, as in this case.
        x[task] = []
    elif task == 'copa':
        # Multiple choice: each example is (context, [choice, choice]).
        x[task] = [
            ('context in pair 1', ['choice 1 in pair 1', 'choice 2 in pair 1']),
            ('context in pair 2', ['choice 1 in pair 2', 'choice 2 in pair 2']),
        ]
    else:
        # Fallback for any task name not handled above.
        x[task] = ['test phrase']

# Positional arguments: every task's inputs first (in `tasks` order),
# followed by one label list per task (left empty here).
list_of_x = [x[task] for task in tasks]
list_of_y = [[] for _ in tasks]
args = list_of_x + list_of_y
# Call the model with the per-task inputs followed by the per-task label
# lists (all empty here), unpacked as positional arguments.
outputs=model(*args)
{
"dataset_reader": {
"class_name": "multitask_reader",
"task_defaults": {
"class_name": "huggingface_dataset_reader",
"path": "glue",
"train": "train",
"valid": "validation",
"test": "test"
},
"tasks": {
"rte": {"name": "rte"},
"copa": {
"path": "super_glue",
"name": "copa"
},
"conll": {
"class_name": "conll2003_reader",
"use_task_defaults": false,
"data_path": "{DOWNLOADS_PATH}/conll2003/",
"dataset_name": "conll2003",
"provide_pos": false
}
}
},
"dataset_iterator": {
"class_name": "multitask_iterator",
"num_train_epochs": "{NUM_TRAIN_EPOCHS}",
"gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}",
"seed": 42,
"task_defaults": {
"class_name": "huggingface_dataset_iterator",
"label": "label",
"use_label_name": false,
"seed": 42
},
"tasks": {
"rte": {
"features": ["sentence1", "sentence2"]
},
"copa": {
"features": ["contexts", "choices"]
},
"conll": {
"class_name": "basic_classification_iterator",
"seed": 42,
"use_task_defaults": false
}
}
},
"chainer": {
"in": ["x_rte", "x_copa", "x_conll"],
"in_y": ["y_rte", "y_copa", "y_conll"],
"pipe": [
{
"class_name": "multitask_pipeline_preprocessor",
"possible_keys_to_extract": [0, 1],
"preprocessors": [
"TorchTransformersPreprocessor",
"TorchTransformersMultiplechoicePreprocessor",
"TorchTransformersNerPreprocessor"
],
"do_lower_case": true,
"n_task": 3,
"vocab_file": "{BACKBONE}",
"max_seq_length": 200,
"max_subword_length": 15,
"token_masking_prob": 0.0,
"return_features": true,
"in": ["x_rte", "x_copa", "x_conll"],
"out": [
"bert_features_rte",
"bert_features_copa",
"bert_features_conll"
]
},
{
"id": "vocab_conll",
"class_name": "simple_vocab",
"unk_token": ["O"],
"pad_with_zeros": true,
"save_path": "{MODELS_PATH}/tag.dict",
"load_path": "{MODELS_PATH}/tag.dict",
"fit_on": ["y_conll"],
"in": ["y_conll"],
"out": ["y_ids_conll"]
},
{
"id": "multitask_transformer",
"class_name": "multitask_transformer",
"optimizer_parameters": {"lr": 2e-5},
"gradient_accumulation_steps": "{GRADIENT_ACC_STEPS}",
"learning_rate_drop_patience": 2,
"learning_rate_drop_div": 2.0,
"return_probas": true,
"backbone_model": "{BACKBONE}",
"save_path": "{MODEL_PATH}",
"load_path": "{MODEL_PATH}",
"tasks": {
"rte": {
"type": "classification",
"options": 2
},
"copa": {
"type": "multiple_choice",
"options": 2
},
"conll": {
"type": "sequence_labeling",
"options": "#vocab_conll.len"
}
},
"in": [
"bert_features_rte",
"bert_features_copa",
"bert_features_conll"
],
"in_y": ["y_rte", "y_copa", "y_ids_conll"],
"out": [
"y_rte_pred_probas",
"y_copa_pred_probas",
"y_conll_pred_ids"
]
},
{
"in": ["y_rte_pred_probas"],
"out": ["y_rte_pred_ids"],
"class_name": "proba2labels",
"max_proba": true
},
{
"in": ["y_copa_pred_probas"],
"out": ["y_copa_pred_ids"],
"class_name": "proba2labels",
"max_proba": true
},
{
"in": ["y_conll_pred_ids"],
"out": ["y_conll_pred_labels"],
"ref": "vocab_conll"
}
],
"out": ["y_rte_pred_ids", "y_copa_pred_ids", "y_conll_pred_labels"]
},
"train": {
"epochs": "{NUM_TRAIN_EPOCHS}",
"batch_size": 32,
"metrics": [
{
"name": "multitask_accuracy",
"inputs": ["y_rte", "y_copa", "y_rte_pred_ids", "y_copa_pred_ids"]
},
{
"name": "ner_f1",
"inputs": ["y_conll", "y_conll_pred_labels"]
},
{
"name": "ner_token_f1",
"inputs": ["y_conll", "y_conll_pred_labels"]
},
{
"name": "accuracy",
"alias": "accuracy_rte",
"inputs": ["y_rte", "y_rte_pred_ids"]
},
{
"name": "accuracy",
"alias": "accuracy_copa",
"inputs": ["y_copa", "y_copa_pred_ids"]
}
],
"validation_patience": 3,
"log_every_n_epochs": 1,
"show_examples": false,
"evaluation_targets": ["valid"],
"class_name": "torch_trainer",
"pytest_max_batches": 2
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"MODELS_PATH": "{ROOT_PATH}/models/multitask_example",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"BACKBONE": "distilbert-base-uncased",
"MODEL_PATH": "{MODELS_PATH}/{BACKBONE}_3task",
"NUM_TRAIN_EPOCHS": 5,
"GRADIENT_ACC_STEPS": 1
}
}
}
python -m deeppavlov train mtl_3task.json