[{"data":1,"prerenderedAt":427},["ShallowReactive",2],{"content-query-36H4EYNxyD":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":421,"_id":422,"_source":423,"_file":424,"_stem":425,"_extension":426},"/technology-blogs/zh/3528","zh",false,"","基于MindSpore NLP的Roberta模型Prompt Tuning","作者：ethan__chen   原文链接：https://www.hiascend.com/developer/blog/details/02107166711681514014","2024-12-09","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/01/15/19609e81025f4d0781ec6276350c4faf.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":418},"root",[17,25,37,53,58,66,71,79,84,94,102,110,115,123,131,139,144,152,160,168,173,181,189,197,202,210,218,226,231,239,247,255,260,268,276,284,289,297,305,310,317,325,330,338,345,353,358,363,368,373,378,386,391,405,413],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"基于mindspore-nlp的roberta模型prompt-tuning",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,31],{"type":24,"value":30},"作者：",{"type":18,"tag":32,"props":33,"children":34},"strong",{},[35],{"type":24,"value":36},"ethan__chen",{"type":18,"tag":26,"props":38,"children":39},{},[40,42],{"type":24,"value":41},"原文链接：",{"type":18,"tag":32,"props":43,"children":44},{},[45],{"type":18,"tag":46,"props":47,"children":51},"a",{"href":48,"rel":49},"https://www.hiascend.com/developer/blog/details/02107166711681514014",[50],"nofollow",[52],{"type":24,"value":48},{"type":18,"tag":26,"props":54,"children":55},{},[56],{"type":24,"value":57},"本文档介绍了如何基于MindSpore NLP进行Roberta模型的Prompt Tuning，主要用于GLUE基准数据集的微调。本文提供了完整的代码示例以及详细的步骤说明，便于理解和复现实验。",{"type":18,"tag":26,"props":59,"children":60},{},[61],{"type":18,"tag":32,"props":62,"children":63},{},[64],{"type":24,"value":65},"配置环境",{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":24,"value":70},"在运行此代码前，请确保MindSpore 
NLP库已经安装。本文档基于大模型平台运行，因此需要进行适当的环境配置，确保代码可以在相应的平台上运行。",{"type":18,"tag":26,"props":72,"children":73},{},[74],{"type":18,"tag":32,"props":75,"children":76},{},[77],{"type":24,"value":78},"模型与数据集加载",{"type":18,"tag":26,"props":80,"children":81},{},[82],{"type":24,"value":83},"在本案例中，我们使用 roberta-large 模型并基于GLUE基准数据集进行Prompt Tuning。GLUE (General Language Understanding Evaluation) 是自然语言处理中的标准评估基准，包括多个子任务，如句子相似性匹配、自然语言推理等。Prompt Tuning是一种新的微调技术，通过在模型的输入中插入虚拟的“提示”Token，以较少的微调参数达到较好的性能。",{"type":18,"tag":85,"props":86,"children":88},"pre",{"code":87},"import mindspore\nfrom tqdm import tqdm\nfrom mindnlp import evaluate\nfrom mindnlp.dataset import load_dataset\nfrom mindnlp.transformers import AutoModelForSequenceClassification, AutoTokenizer\nfrom mindnlp.core.optim import AdamW\nfrom mindnlp.transformers.optimization import get_linear_schedule_with_warmup\nfrom mindnlp.peft import (\n    get_peft_model,\n    PeftType,\n    PromptTuningConfig,\n)\n",[89],{"type":18,"tag":90,"props":91,"children":92},"code",{"__ignoreMap":7},[93],{"type":24,"value":87},{"type":18,"tag":26,"props":95,"children":96},{},[97],{"type":18,"tag":32,"props":98,"children":99},{},[100],{"type":24,"value":101},"01",{"type":18,"tag":26,"props":103,"children":104},{},[105],{"type":18,"tag":32,"props":106,"children":107},{},[108],{"type":24,"value":109},"定义训练参数",{"type":18,"tag":26,"props":111,"children":112},{},[113],{"type":24,"value":114},"首先，定义模型名称、数据集任务名称、Prompt Tuning类型、训练轮数等基本参数。",{"type":18,"tag":85,"props":116,"children":118},{"code":117},"batch_size = 32\nmodel_name_or_path = \"roberta-large\"\ntask = \"mrpc\"\npeft_type = PeftType.PROMPT_TUNING\nnum_epochs = 
20\n",[119],{"type":18,"tag":90,"props":120,"children":121},{"__ignoreMap":7},[122],{"type":24,"value":117},{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":18,"tag":32,"props":127,"children":128},{},[129],{"type":24,"value":130},"02",{"type":18,"tag":26,"props":132,"children":133},{},[134],{"type":18,"tag":32,"props":135,"children":136},{},[137],{"type":24,"value":138},"配置Prompt Tuning",{"type":18,"tag":26,"props":140,"children":141},{},[142],{"type":24,"value":143},"在Prompt Tuning的配置中，选择任务类型为\"SEQ_CLS\"（序列分类任务），并定义虚拟Token的数量。虚拟Token即为插入模型输入中的“提示”Token，通过这些Token的微调，使得模型能够更好地完成下游任务。",{"type":18,"tag":85,"props":145,"children":147},{"code":146},"peft_config = PromptTuningConfig(task_type=\"SEQ_CLS\", num_virtual_tokens=10)\nlr = 1e-3\n",[148],{"type":18,"tag":90,"props":149,"children":150},{"__ignoreMap":7},[151],{"type":24,"value":146},{"type":18,"tag":26,"props":153,"children":154},{},[155],{"type":18,"tag":32,"props":156,"children":157},{},[158],{"type":24,"value":159},"03",{"type":18,"tag":26,"props":161,"children":162},{},[163],{"type":18,"tag":32,"props":164,"children":165},{},[166],{"type":24,"value":167},"加载Tokenizer",{"type":18,"tag":26,"props":169,"children":170},{},[171],{"type":24,"value":172},"根据模型类型选择padding的侧边，如果模型为GPT、OPT或BLOOM类模型，则从序列左侧填充（padding），否则从序列右侧填充。",{"type":18,"tag":85,"props":174,"children":176},{"code":175},"if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n    padding_side = \"left\"\nelse:\n    padding_side = \"right\"\n\ntokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\nif getattr(tokenizer, \"pad_token_id\") is None:\n    tokenizer.pad_token_id = 
tokenizer.eos_token_id\n",[177],{"type":18,"tag":90,"props":178,"children":179},{"__ignoreMap":7},[180],{"type":24,"value":175},{"type":18,"tag":26,"props":182,"children":183},{},[184],{"type":18,"tag":32,"props":185,"children":186},{},[187],{"type":24,"value":188},"04",{"type":18,"tag":26,"props":190,"children":191},{},[192],{"type":18,"tag":32,"props":193,"children":194},{},[195],{"type":24,"value":196},"加载数据集",{"type":18,"tag":26,"props":198,"children":199},{},[200],{"type":24,"value":201},"通过MindSpore NLP加载GLUE数据集，并打印样本以便确认数据格式。在此示例中，我们使用GLUE的MRPC（Microsoft Research Paraphrase Corpus）任务，该任务用于句子匹配，即判断两个句子是否表达相同的意思。",{"type":18,"tag":85,"props":203,"children":205},{"code":204},"datasets = load_dataset(\"glue\", task)\nprint(next(datasets['train'].create_dict_iterator()))\n",[206],{"type":18,"tag":90,"props":207,"children":208},{"__ignoreMap":7},[209],{"type":24,"value":204},{"type":18,"tag":26,"props":211,"children":212},{},[213],{"type":18,"tag":32,"props":214,"children":215},{},[216],{"type":24,"value":217},"05",{"type":18,"tag":26,"props":219,"children":220},{},[221],{"type":18,"tag":32,"props":222,"children":223},{},[224],{"type":24,"value":225},"数据预处理",{"type":18,"tag":26,"props":227,"children":228},{},[229],{"type":24,"value":230},"为了适配MindSpore NLP的数据处理流程，我们定义了一个映射函数 MapFunc，用于将句子转换为 input_ids 和 attention_mask，并对数据进行padding处理。",{"type":18,"tag":85,"props":232,"children":234},{"code":233},"from mindnlp.dataset import BaseMapFunction\n\nclass MapFunc(BaseMapFunction):\n    def __call__(self, sentence1, sentence2, label, idx):\n        outputs = tokenizer(sentence1, sentence2, truncation=True, max_length=None)\n        return outputs['input_ids'], outputs['attention_mask'], label\n\ndef get_dataset(dataset, tokenizer):\n    input_colums=['sentence1', 'sentence2', 'label', 'idx']\n    output_columns=['input_ids', 'attention_mask', 'labels']\n    dataset = dataset.map(MapFunc(input_colums, output_columns),\n                          input_colums, 
output_columns)\n    dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),\n                                                         'attention_mask': (None, 0)})\n    return dataset\n\ntrain_dataset = get_dataset(datasets['train'], tokenizer)\neval_dataset = get_dataset(datasets['validation'], tokenizer)\n",[235],{"type":18,"tag":90,"props":236,"children":237},{"__ignoreMap":7},[238],{"type":24,"value":233},{"type":18,"tag":26,"props":240,"children":241},{},[242],{"type":18,"tag":32,"props":243,"children":244},{},[245],{"type":24,"value":246},"06",{"type":18,"tag":26,"props":248,"children":249},{},[250],{"type":18,"tag":32,"props":251,"children":252},{},[253],{"type":24,"value":254},"设置评估指标",{"type":18,"tag":26,"props":256,"children":257},{},[258],{"type":24,"value":259},"我们使用 evaluate 模块加载评估指标（accuracy 和 F1-score）来评估模型的性能。",{"type":18,"tag":85,"props":261,"children":263},{"code":262},"metric = evaluate.load(\"./glue.py\", task)\n",[264],{"type":18,"tag":90,"props":265,"children":266},{"__ignoreMap":7},[267],{"type":24,"value":262},{"type":18,"tag":26,"props":269,"children":270},{},[271],{"type":18,"tag":32,"props":272,"children":273},{},[274],{"type":24,"value":275},"07",{"type":18,"tag":26,"props":277,"children":278},{},[279],{"type":18,"tag":32,"props":280,"children":281},{},[282],{"type":24,"value":283},"加载模型并配置Prompt Tuning",{"type":18,"tag":26,"props":285,"children":286},{},[287],{"type":24,"value":288},"加载 roberta-large 模型，并根据配置进行Prompt Tuning。可以看到，微调的参数量仅为总参数量的0.3%左右，节省了大量计算资源。",{"type":18,"tag":85,"props":290,"children":292},{"code":291},"model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)\nmodel = get_peft_model(model, 
peft_config)\nmodel.print_trainable_parameters()\n",[293],{"type":18,"tag":90,"props":294,"children":295},{"__ignoreMap":7},[296],{"type":24,"value":291},{"type":18,"tag":26,"props":298,"children":299},{},[300],{"type":18,"tag":32,"props":301,"children":302},{},[303],{"type":24,"value":304},"模型微调（Prompt Tuning）",{"type":18,"tag":26,"props":306,"children":307},{},[308],{"type":24,"value":309},"在Prompt Tuning中，训练过程中仅微调部分参数（主要是虚拟Token相关的参数），相比于传统微调而言，大大减少了需要调整的参数量，使得模型能够高效适应下游任务。",{"type":18,"tag":26,"props":311,"children":312},{},[313],{"type":18,"tag":32,"props":314,"children":315},{},[316],{"type":24,"value":101},{"type":18,"tag":26,"props":318,"children":319},{},[320],{"type":18,"tag":32,"props":321,"children":322},{},[323],{"type":24,"value":324},"优化器与学习率调整",{"type":18,"tag":26,"props":326,"children":327},{},[328],{"type":24,"value":329},"使用 AdamW 优化器，并设置线性学习率调整策略。",{"type":18,"tag":85,"props":331,"children":333},{"code":332},"optimizer = AdamW(params=model.parameters(), lr=lr)\n\n# Instantiate scheduler\nlr_scheduler = get_linear_schedule_with_warmup(\n    optimizer=optimizer,\n    num_warmup_steps=0.06 * (len(train_dataset) * num_epochs),\n    num_training_steps=(len(train_dataset) * num_epochs),\n)\n",[334],{"type":18,"tag":90,"props":335,"children":336},{"__ignoreMap":7},[337],{"type":24,"value":332},{"type":18,"tag":26,"props":339,"children":340},{},[341],{"type":18,"tag":32,"props":342,"children":343},{},[344],{"type":24,"value":130},{"type":18,"tag":26,"props":346,"children":347},{},[348],{"type":18,"tag":32,"props":349,"children":350},{},[351],{"type":24,"value":352},"训练数据集",{"type":18,"tag":26,"props":354,"children":355},{},[356],{"type":24,"value":357},"训练步骤如下：",{"type":18,"tag":26,"props":359,"children":360},{},[361],{"type":24,"value":362},"1、构建正向计算函数 forward_fn。",{"type":18,"tag":26,"props":364,"children":365},{},[366],{"type":24,"value":367},"2、定义梯度计算函数 
grad_fn。",{"type":18,"tag":26,"props":369,"children":370},{},[371],{"type":24,"value":372},"3、定义每一步的训练逻辑 train_step。",{"type":18,"tag":26,"props":374,"children":375},{},[376],{"type":24,"value":377},"4、遍历数据集进行训练和评估，在每个 epoch 结束时，计算评估指标。",{"type":18,"tag":85,"props":379,"children":381},{"code":380},"def forward_fn(**batch):\n    outputs = model(**batch)\n    loss = outputs.loss\n    return loss\n\ngrad_fn = mindspore.value_and_grad(forward_fn, None, tuple(model.parameters()))\n\ndef train_step(**batch):\n    loss, grads = grad_fn(**batch)\n    optimizer.step(grads)\n    return loss\n\nfor epoch in range(num_epochs):\n    model.set_train()\n    train_total_size = train_dataset.get_dataset_size()\n    for step, batch in enumerate(tqdm(train_dataset.create_dict_iterator(), total=train_total_size)):\n        loss = train_step(**batch)\n        lr_scheduler.step()\n\n    model.set_train(False)\n    eval_total_size = eval_dataset.get_dataset_size()\n    for step, batch in enumerate(tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)):\n        outputs = model(**batch)\n        predictions = outputs.logits.argmax(axis=-1)\n        predictions, references = predictions, batch[\"labels\"]\n        metric.add_batch(\n            predictions=predictions,\n            references=references,\n        )\n\n    eval_metric = metric.compute()\n    print(f\"epoch {epoch}:\", eval_metric)\n",[382],{"type":18,"tag":90,"props":383,"children":384},{"__ignoreMap":7},[385],{"type":24,"value":380},{"type":18,"tag":26,"props":387,"children":388},{},[389],{"type":24,"value":390},"在每个 epoch 后，程序输出当前模型的评估指标（accuracy 和 F1-score）。从结果中可以看到，模型的准确率和 F1-score 会随着训练的进展逐渐提升。",{"type":18,"tag":26,"props":392,"children":393},{},[394,399,401],{"type":18,"tag":395,"props":396,"children":398},"img",{"alt":7,"src":397},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/12/13/5811a73bcccb43c5a605f813ff3d7c2a.png",[],{"type":24,"value":400}," 
",{"type":18,"tag":395,"props":402,"children":404},{"alt":7,"src":403},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/12/13/3fef73baceab407c8cd0af34c1ff7047.png",[],{"type":18,"tag":26,"props":406,"children":407},{},[408],{"type":18,"tag":32,"props":409,"children":410},{},[411],{"type":24,"value":412},"总结",{"type":18,"tag":26,"props":414,"children":415},{},[416],{"type":24,"value":417},"本案例通过Prompt Tuning技术，在Roberta模型上进行了微调以适应GLUE数据集任务。通过控制微调参数量，Prompt Tuning展示了较强的高效性。",{"title":7,"searchDepth":419,"depth":419,"links":420},4,[],"markdown","content:technology-blogs:zh:3528.md","content","technology-blogs/zh/3528.md","technology-blogs/zh/3528","md",1776506130573]