# transformers/train_clm_with_hf_trainer.py

# This script is adapted from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py
# For more specific training tasks, please refer to https://github.com/huggingface/transformers/tree/main/examples/pytorch
import dataclasses
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

import transformers
from transformers import (
    MODEL_FOR_CAUSAL_LM_MAPPING,
    TrainingArguments,
    default_data_collator,
    is_torch_tpu_available,
    set_seed,
)
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint

# Module-level logger; its level is set inside `train_clm_with_hf_trainer`.
logger = logging.getLogger(__name__)

# Keys of the causal-LM model mapping (presumably the registered config classes -- confirm
# against the transformers documentation) and the `model_type` attribute read off each key.
# NOTE(review): neither constant is referenced by the code visible in this file.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


def dataclass_from_dict(klass, d):
    """
    Recursively build an instance of the dataclass ``klass`` from the mapping ``d``.

    Each value in ``d`` is converted with the declared type of the field of the same name, so
    nested dataclass fields are built from nested mappings.

    Args:
        klass: The target type. When it is not a dataclass class (e.g. ``int`` or a ``typing``
            construct such as ``Optional[str]``), ``d`` is returned unchanged.
        d: The value to convert. When it is not a mapping (e.g. a scalar, or an object that has
            already been built), it is returned unchanged.

    Returns:
        An instance of ``klass`` when ``klass`` is a dataclass class and ``d`` is a mapping,
        otherwise ``d`` itself.

    Raises:
        TypeError: If ``d`` contains keys that are not fields of ``klass``.
        Exception: Anything raised while constructing ``klass`` (for instance validation errors
            from ``__post_init__``) propagates to the caller instead of being swallowed.
    """
    from collections.abc import Mapping

    # Recursion base cases: nothing to build, hand the value back as it is.
    if not (isinstance(klass, type) and dataclasses.is_dataclass(klass)):
        return d
    if not isinstance(d, Mapping):
        return d

    fieldtypes = {f.name: f.type for f in dataclasses.fields(klass)}
    unknown = [key for key in d if key not in fieldtypes]
    if unknown:
        raise TypeError(f"{klass.__name__} has no field(s) named {unknown!r}")

    # Errors raised here are real configuration problems, so they are deliberately not caught.
    return klass(**{name: dataclass_from_dict(fieldtypes[name], d[name]) for name in d})


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    At least one of ``dataset_name``, ``train_file`` or ``validation_file`` must be provided, and
    any file given must end in ``.csv``, ``.json`` or ``.txt``; otherwise ``ValueError`` is raised
    when the instance is created.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )

    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )

    def __post_init__(self):
        """Validate that a data source is given and that data files have a supported extension."""
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")

        # Raise explicitly instead of using `assert`: assertions are removed when Python runs
        # with -O, which would silently disable this validation.
        for arg_name, path in (("train_file", self.train_file), ("validation_file", self.validation_file)):
            if path is None:
                continue
            extension = path.split(".")[-1]
            if extension not in ("csv", "json", "txt"):
                raise ValueError(f"`{arg_name}` should be a csv, a json or a txt file.")


def train_clm_with_hf_trainer(model,
                              tokenizer,
                              data_args,
                              training_args,
                              **kwargs):
    """
    Train and/or evaluate a causal language model with the Hugging Face ``Trainer``.

    The function loads a dataset (from the hub when ``dataset_name`` is set, otherwise from local
    csv/json/txt files), tokenizes its text column, regroups the tokens into fixed-size blocks and
    hands the result to ``Trainer``. Training and evaluation run only when
    ``training_args.do_train`` / ``training_args.do_eval`` are set.

    Args:
        model: Model passed to ``Trainer``. Its token embeddings are enlarged when the tokenizer
            holds more tokens than the input embedding matrix has rows.
        tokenizer: Tokenizer used to encode the text column; its ``model_max_length`` caps the
            block size.
        data_args: Field values for ``DataTrainingArguments``, converted with
            ``dataclass_from_dict``.
        training_args: Field values for ``transformers.TrainingArguments``, converted with
            ``dataclass_from_dict``.
        **kwargs: Accepted but not used by this function.

    Raises:
        ValueError: If the output directory exists, is not empty, holds no checkpoint and
            ``overwrite_output_dir`` is not set; or if training/evaluation is requested without
            the matching dataset split.
    """
    # Imported lazily so that importing this module does not require these packages.
    import evaluate
    import datasets
    from transformers import Trainer
    from datasets import load_dataset

    print('train clm with hugging face transformers trainer')

    data_args = dataclass_from_dict(DataTrainingArguments, data_args)
    training_args = dataclass_from_dict(TrainingArguments, training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    validation_slice = f"train[:{data_args.validation_split_percentage}%]"
    train_slice = f"train[{data_args.validation_split_percentage}%:]"
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
        )
        # Without a validation split, carve one off the head of the train split.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=validation_slice,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=train_slice,
            )
    else:
        data_files = {}
        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # The loader is chosen from the file extension (train file first, else validation file).
        extension = (
            data_args.train_file.split(".")[-1]
            if data_args.train_file is not None
            else data_args.validation_file.split(".")[-1]
        )
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            **dataset_args,
        )
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=validation_slice,
                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=train_slice,
                **dataset_args,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
    # on a small vocab and want a smaller embedding size, remove this test.
    embedding_size = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_size:
        model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        # Check before indexing: when only a validation file was given there is no "train"
        # split, and indexing it would fail with a bare KeyError instead of this message.
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        """Tokenize the text column of a batch, flagging the tokenizer's over-length warning as harmless."""
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output

    with training_args.main_process_first(desc="dataset map tokenization"):
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
            block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        """Concatenate every column of a batch and cut it into ``block_size`` chunks; labels copy input_ids."""
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    with training_args.main_process_first(desc="grouping texts together"):
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc=f"Grouping texts in chunks of {block_size}",
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = lm_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = lm_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

        def preprocess_logits_for_metrics(logits, labels):
            """Reduce logits to predicted token ids (argmax over the last axis)."""
            if isinstance(logits, tuple):
                # Depending on the model and config, logits may contain extra tensors,
                # like past_key_values, but logits always come first
                logits = logits[0]
            return logits.argmax(dim=-1)

        metric = evaluate.load("accuracy")

        def compute_metrics(eval_preds):
            """Compute accuracy between predictions and labels shifted by one position."""
            preds, labels = eval_preds
            # preds have the same shape as the labels, after the argmax(-1) has been calculated
            # by preprocess_logits_for_metrics but we need to shift the labels
            labels = labels[:, 1:].reshape(-1)
            preds = preds[:, :-1].reshape(-1)
            return metric.compute(predictions=preds, references=labels)

    # Metric hooks are only wired in when evaluating and not running on TPU.
    use_metrics = training_args.do_eval and not is_torch_tpu_available()

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
        compute_metrics=compute_metrics if use_metrics else None,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics if use_metrics else None,
    )

    # Training
    if training_args.do_train:
        # An explicitly requested checkpoint wins over one detected in the output directory.
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        # Perplexity is exp(eval loss); a loss too large for exp() is reported as infinity.
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    print('done clm.')
train mlm and clm task 2 years ago			`# This script is hacked and modified from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py`
			`# For more specified training tasks, please refer https://github.com/huggingface/transformers/tree/main/examples/pytorch`
			`import dataclasses`
			`import logging`
			`import math`
			`import os`
			`import sys`
			`from dataclasses import dataclass, field`
			`from itertools import chain`
			`from typing import Optional`

			`import transformers`
			`from transformers import (`
			`MODEL_FOR_CAUSAL_LM_MAPPING,`
			`TrainingArguments,`
			`default_data_collator,`
			`is_torch_tpu_available,`
			`set_seed,`
			`)`
			`from transformers.testing_utils import CaptureLogger`
			`from transformers.trainer_utils import get_last_checkpoint`

			`logger = logging.getLogger(__name__)`

			`MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())`
			`MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)`


			`def dataclass_from_dict(klass, d):`
			`try:`
			`fieldtypes = {f.name: f.type for f in dataclasses.fields(klass)}`
			`return klass(**{f: dataclass_from_dict(fieldtypes[f], d[f]) for f in d})`
			`except:`
			`return d # Not a dataclass field`


			`@dataclass`
			`class DataTrainingArguments:`
			`"""`
			`Arguments pertaining to what data we are going to input our model for training and eval.`
			`"""`

			`dataset_name: Optional[str] = field(`
			`default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}`
			`)`
			`dataset_config_name: Optional[str] = field(`
			`default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}`
			`)`
			`train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})`
			`validation_file: Optional[str] = field(`
			`default=None,`
			`metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},`
			`)`
			`max_train_samples: Optional[int] = field(`
			`default=None,`
			`metadata={`
			`"help": (`
			`"For debugging purposes or quicker training, truncate the number of training examples to this "`
			`"value if set."`
			`)`
			`},`
			`)`
			`max_eval_samples: Optional[int] = field(`
			`default=None,`
			`metadata={`
			`"help": (`
			`"For debugging purposes or quicker training, truncate the number of evaluation examples to this "`
			`"value if set."`
			`)`
			`},`
			`)`

			`block_size: Optional[int] = field(`
			`default=None,`
			`metadata={`
			`"help": (`
			`"Optional input sequence length after tokenization. "`
			`"The training dataset will be truncated in block of this size for training. "`
			`"Default to the model max input length for single sentence inputs (take into account special tokens)."`
			`)`
			`},`
			`)`
			`overwrite_cache: bool = field(`
			`default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}`
			`)`
			`validation_split_percentage: Optional[int] = field(`
			`default=5,`
			`metadata={`
			`"help": "The percentage of the train set used as validation set in case there's no validation split"`
			`},`
			`)`
			`preprocessing_num_workers: Optional[int] = field(`
			`default=None,`
			`metadata={"help": "The number of processes to use for the preprocessing."},`
			`)`
			`keep_linebreaks: bool = field(`
			`default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}`
			`)`

			`def __post_init__(self):`
			`if self.dataset_name is None and self.train_file is None and self.validation_file is None:`
			`raise ValueError("Need either a dataset name or a training/validation file.")`
			`else:`
			`if self.train_file is not None:`
			`extension = self.train_file.split(".")[-1]`
			assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
			`if self.validation_file is not None:`
			`extension = self.validation_file.split(".")[-1]`
			assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."


			`def train_clm_with_hf_trainer(model,`
			`tokenizer,`
			`data_args,`
			`training_args,`
			`**kwargs):`
lazy import evaluate and datasets for avoiding potential error. 2 years ago			`import evaluate`
			`import datasets`
lazy import Trainer for avoiding potential error. 2 years ago			`from transformers import Trainer`
lazy import evaluate and datasets for avoiding potential error. 2 years ago			`from datasets import load_dataset`

train mlm and clm task 2 years ago			`print('train clm with hugging face transformers trainer')`

			`data_args = dataclass_from_dict(DataTrainingArguments, data_args)`
			`training_args = dataclass_from_dict(TrainingArguments, training_args)`

			`# Setup logging`
			`logging.basicConfig(`
			`format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",`
			`datefmt="%m/%d/%Y %H:%M:%S",`
			`handlers=[logging.StreamHandler(sys.stdout)],`
			`)`

			`log_level = training_args.get_process_log_level()`
			`logger.setLevel(log_level)`
			`datasets.utils.logging.set_verbosity(log_level)`
			`transformers.utils.logging.set_verbosity(log_level)`
			`transformers.utils.logging.enable_default_handler()`
			`transformers.utils.logging.enable_explicit_format()`

			`# Log on each process the small summary:`
			`logger.warning(`
			`f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"`
			`+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"`
			`)`
			`logger.info(f"Training/evaluation parameters {training_args}")`

			`# Detecting last checkpoint.`
			`last_checkpoint = None`
			`if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:`
			`last_checkpoint = get_last_checkpoint(training_args.output_dir)`
			`if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:`
			`raise ValueError(`
			`f"Output directory ({training_args.output_dir}) already exists and is not empty. "`
			`"Use --overwrite_output_dir to overcome."`
			`)`
			`elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:`
			`logger.info(`
			`f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "`
			"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
			`)`

			`# Set seed before initializing model.`
			`set_seed(training_args.seed)`

			`# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)`
			`# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/`
			`# (the dataset will be downloaded automatically from the datasets Hub).`
			`#`
			`# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called`
			`# 'text' is found. You can easily tweak this behavior (see below).`
			`#`
			`# In distributed training, the load_dataset function guarantee that only one local process can concurrently`
			`# download the dataset.`
			`if data_args.dataset_name is not None:`
			`# Downloading and loading a dataset from the hub.`
			`raw_datasets = load_dataset(`
			`data_args.dataset_name,`
			`data_args.dataset_config_name,`
			`# cache_dir=model_args.cache_dir,`
			`# use_auth_token=True if model_args.use_auth_token else None,`
			`)`
			`if "validation" not in raw_datasets.keys():`
			`raw_datasets["validation"] = load_dataset(`
			`data_args.dataset_name,`
			`data_args.dataset_config_name,`
			`split=f"train[:{data_args.validation_split_percentage}%]",`
			`# cache_dir=model_args.cache_dir,`
			`# use_auth_token=True if model_args.use_auth_token else None,`
			`)`
			`raw_datasets["train"] = load_dataset(`
			`data_args.dataset_name,`
			`data_args.dataset_config_name,`
			`split=f"train[{data_args.validation_split_percentage}%:]",`
			`# cache_dir=model_args.cache_dir,`
			`# use_auth_token=True if model_args.use_auth_token else None,`
			`)`
			`else:`
			`data_files = {}`
			`dataset_args = {}`
			`if data_args.train_file is not None:`
			`data_files["train"] = data_args.train_file`
			`if data_args.validation_file is not None:`
			`data_files["validation"] = data_args.validation_file`
			`extension = (`
			`data_args.train_file.split(".")[-1]`
			`if data_args.train_file is not None`
			`else data_args.validation_file.split(".")[-1]`
			`)`
			`if extension == "txt":`
			`extension = "text"`
			`dataset_args["keep_linebreaks"] = data_args.keep_linebreaks`
			`raw_datasets = load_dataset(`
			`extension,`
			`data_files=data_files,`
			`# cache_dir=model_args.cache_dir,`
			`# use_auth_token=True if model_args.use_auth_token else None,`
			`**dataset_args,`
			`)`
			`# If no validation data is there, validation_split_percentage will be used to divide the dataset.`
			`if "validation" not in raw_datasets.keys():`
			`raw_datasets["validation"] = load_dataset(`
			`extension,`
			`data_files=data_files,`
			`split=f"train[:{data_args.validation_split_percentage}%]",`
			`# cache_dir=model_args.cache_dir,`
			`# use_auth_token=True if model_args.use_auth_token else None,`
			`**dataset_args,`
			`)`
			`raw_datasets["train"] = load_dataset(`
			`extension,`
			`data_files=data_files,`
			`split=f"train[{data_args.validation_split_percentage}%:]",`
			`# cache_dir=model_args.cache_dir,`
			`# use_auth_token=True if model_args.use_auth_token else None,`
			`**dataset_args,`
			`)`

			`# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at`
			`# https://huggingface.co/docs/datasets/loading_datasets.html.`

			`# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch`
			`# on a small vocab and want a smaller embedding size, remove this test.`
			`embedding_size = model.get_input_embeddings().weight.shape[0]`
			`if len(tokenizer) > embedding_size:`
			`model.resize_token_embeddings(len(tokenizer))`

			`# Preprocessing the datasets.`
			`# First we tokenize all the texts.`
			`if training_args.do_train:`
			`column_names = raw_datasets["train"].column_names`
			`else:`
			`column_names = raw_datasets["validation"].column_names`
			`text_column_name = "text" if "text" in column_names else column_names[0]`

			`# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function`
			`tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")`

			`def tokenize_function(examples):`
			`with CaptureLogger(tok_logger) as cl:`
			`output = tokenizer(examples[text_column_name])`
			`# clm input could be much much longer than block_size`
			`if "Token indices sequence length is longer than the" in cl.out:`
			`tok_logger.warning(`
			`"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"`
			`" before being passed to the model."`
			`)`
			`return output`

			`with training_args.main_process_first(desc="dataset map tokenization"):`
			`tokenized_datasets = raw_datasets.map(`
			`tokenize_function,`
			`batched=True,`
			`num_proc=data_args.preprocessing_num_workers,`
			`remove_columns=column_names,`
			`load_from_cache_file=not data_args.overwrite_cache,`
			`desc="Running tokenizer on dataset",`
			`)`

			`if data_args.block_size is None:`
			`block_size = tokenizer.model_max_length`
			`if block_size > 1024:`
			`logger.warning(`
			f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
			`"Picking 1024 instead. You can change that default value by passing --block_size xxx."`
			`)`
			`block_size = 1024`
			`else:`
			`if data_args.block_size > tokenizer.model_max_length:`
			`logger.warning(`
			`f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"`
			`f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."`
			`)`
			`block_size = min(data_args.block_size, tokenizer.model_max_length)`

			`# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.`
			`def group_texts(examples):`
			`# Concatenate all texts.`
			`concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}`
			`total_length = len(concatenated_examples[list(examples.keys())[0]])`
			`# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can`
			`# customize this part to your needs.`
			`if total_length >= block_size:`
			`total_length = (total_length // block_size) * block_size`
			`# Split by chunks of max_len.`
			`result = {`
lazy import evaluate and datasets for avoiding potential error. 2 years ago			`k: [t[i: i + block_size] for i in range(0, total_length, block_size)]`
train mlm and clm task 2 years ago			`for k, t in concatenated_examples.items()`
			`}`
			`result["labels"] = result["input_ids"].copy()`
			`return result`

			# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
			`# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower`
			`# to preprocess.`
			`#`
			`# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:`
			`# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map`

			`with training_args.main_process_first(desc="grouping texts together"):`
			`lm_datasets = tokenized_datasets.map(`
			`group_texts,`
			`batched=True,`
			`num_proc=data_args.preprocessing_num_workers,`
			`load_from_cache_file=not data_args.overwrite_cache,`
			`desc=f"Grouping texts in chunks of {block_size}",`
			`)`

			`if training_args.do_train:`
			`if "train" not in tokenized_datasets:`
			`raise ValueError("--do_train requires a train dataset")`
			`train_dataset = lm_datasets["train"]`
			`if data_args.max_train_samples is not None:`
			`max_train_samples = min(len(train_dataset), data_args.max_train_samples)`
			`train_dataset = train_dataset.select(range(max_train_samples))`

			`if training_args.do_eval:`
			`if "validation" not in tokenized_datasets:`
			`raise ValueError("--do_eval requires a validation dataset")`
			`eval_dataset = lm_datasets["validation"]`
			`if data_args.max_eval_samples is not None:`
			`max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)`
			`eval_dataset = eval_dataset.select(range(max_eval_samples))`

			`def preprocess_logits_for_metrics(logits, labels):`
			`if isinstance(logits, tuple):`
			`# Depending on the model and config, logits may contain extra tensors,`
			`# like past_key_values, but logits always come first`
			`logits = logits[0]`
			`return logits.argmax(dim=-1)`

			`metric = evaluate.load("accuracy")`

			`def compute_metrics(eval_preds):`
			`preds, labels = eval_preds`
			`# preds have the same shape as the labels, after the argmax(-1) has been calculated`
			`# by preprocess_logits_for_metrics but we need to shift the labels`
			`labels = labels[:, 1:].reshape(-1)`
			`preds = preds[:, :-1].reshape(-1)`
			`return metric.compute(predictions=preds, references=labels)`

			`# Initialize our Trainer`
			`trainer = Trainer(`
			`model=model,`
			`args=training_args,`
			`train_dataset=train_dataset if training_args.do_train else None,`
			`eval_dataset=eval_dataset if training_args.do_eval else None,`
			`tokenizer=tokenizer,`
			`# Data collator will default to DataCollatorWithPadding, so we change it.`
			`data_collator=default_data_collator,`
			`compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,`
			`preprocess_logits_for_metrics=preprocess_logits_for_metrics`
			`if training_args.do_eval and not is_torch_tpu_available()`
			`else None,`
			`)`

			`# Training`
			`if training_args.do_train:`
			`checkpoint = None`
			`if training_args.resume_from_checkpoint is not None:`
			`checkpoint = training_args.resume_from_checkpoint`
			`elif last_checkpoint is not None:`
			`checkpoint = last_checkpoint`
			`train_result = trainer.train(resume_from_checkpoint=checkpoint)`
			`trainer.save_model() # Saves the tokenizer too for easy upload`
			`metrics = train_result.metrics`

			`max_train_samples = (`
			`data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)`
			`)`
			`metrics["train_samples"] = min(max_train_samples, len(train_dataset))`

			`trainer.log_metrics("train", metrics)`
			`trainer.save_metrics("train", metrics)`
			`trainer.save_state()`

			`# Evaluation`
			`if training_args.do_eval:`
			`logger.info("* Evaluate *")`

			`metrics = trainer.evaluate()`

			`max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)`
			`metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))`
			`try:`
			`perplexity = math.exp(metrics["eval_loss"])`
			`except OverflowError:`
			`perplexity = float("inf")`
			`metrics["perplexity"] = perplexity`

			`trainer.log_metrics("eval", metrics)`
			`trainer.save_metrics("eval", metrics)`

lazy import evaluate and datasets for avoiding potential error. 2 years ago			`print('done clm.')`