1、安装funasr环境

参考FunASR仓库的安装说明完成环境安装(也可以直接执行 pip3 install -U funasr):
https://gitcode.com/GitHub_Trending/fun/FunASR/

2、数据集准备:

如果是训练paraformer模型,只需要准备train_wav.scp和train_text.txt以及验证集val_wav.scp和val_text.txt即可。
如果是训练SenseVoice模型,我们需要准备下面几个文件:

train_text.txt
train_wav.scp
train_text_language.txt
train_emo.txt
train_event.txt

其中必须的是train_wav.scp和train_text.txt文件。

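下面依次是train_text.txt、train_wav.scp、train_text_language.txt、train_emo.txt、train_event.txt的格式示例,每行均为"音频ID 内容",各文件之间通过音频ID一一对应: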

BAC009S0764W0121 甚至出现交易几乎停滞的情况
BAC009S0916W0489 湖北一公司以员工名义贷款数十员工负债千万
asr_example_cn_en 所有只要处理 data 不管你是做 machine learning 做 deep learning 做 data analytics 做 data science 也好 scientist 也好通通都要都做的基本功啊那 again 先先对有一些 > 也许对
ID0012W0014 he tried to think how it could be
 
注意:如果是中英混杂的句子,中文和英文之间要有空格。

BAC009S0764W0121 /root/train/data/BAC009S0764W0121.wav
BAC009S0916W0489 /root/train/data/BAC009S0916W0489.wav
asr_example_cn_en /root/train/data/asr_example_cn_en.wav
ID0012W0014 /root/train/data/ID0012W0014.wav
注意:音频路径不能使用URL路径,使用URL路径会导致生成的jsonl文件没有内容。

BAC009S0764W0121 <|zh|>
BAC009S0916W0489 <|zh|>
asr_example_cn_en <|zh|>
ID0012W0014 <|en|>
BAC009S0764W0121 <|NEUTRAL|>
BAC009S0916W0489 <|NEUTRAL|>
asr_example_cn_en <|NEUTRAL|>
ID0012W0014 <|NEUTRAL|>
BAC009S0764W0121 <|Speech|>
BAC009S0916W0489 <|Speech|>
asr_example_cn_en <|Speech|>
ID0012W0014 <|Speech|>
下面生成jsonl文件时需要注意,如果你只准备了train_wav.scp和train_text.txt文件,那么执行下面这样的命令。

# Build train.jsonl from the wav list (source) and the transcript list (target) only.
# The command writes a single file, train.jsonl; run it again with the val_* lists
# and a val.jsonl output path to produce the validation set.
jsonl_args=(
  '++scp_file_list=["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]'
  '++data_type_list=["source", "target"]'
  '++jsonl_file_out=../../../data/list/train.jsonl'
  '++model_dir=iic/SenseVoiceSmall'
)
sensevoice2jsonl "${jsonl_args[@]}"
注意:这种情况下没有提供语种、情感和事件标签,sensevoice2jsonl会使用++model_dir指定的SenseVoice模型自动打标(预测这些标签)并写入jsonl文件。

如果是准备了全部文件,那么执行下面命令。

# Build train.jsonl from all five lists: wav paths, transcripts, language tags,
# emotion tags and event tags. Each entry of scp_file_list is paired, in order,
# with the entry at the same position in data_type_list.
# The command writes a single file, train.jsonl; run it again with the val_* lists
# and a val.jsonl output path to produce the validation set.
jsonl_args=(
  '++scp_file_list=["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt", "../../../data/list/train_text_language.txt", "../../../data/list/train_emo.txt", "../../../data/list/train_event.txt"]'
  '++data_type_list=["source", "target", "text_language", "emo_target", "event_target"]'
  '++jsonl_file_out=../../../data/list/train.jsonl'
)
sensevoice2jsonl "${jsonl_args[@]}"
生成的jsonl文件会保存到上面++jsonl_file_out指定的路径下。训练之前最好先检查一下生成的jsonl文件内容是否正确(例如文件不为空、每行都包含音频路径和对应文本)。

3、finetune脚本配置确认:(对比官方,有修改:打印更具体log、对应具体train和val路径、train_ds.py脚本具体位置,以及batch_size等具体参数配置)

#!/bin/bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)
#
# Fine-tunes SenseVoiceSmall with FunASR's train_ds.py, launched through torchrun.
# All torchrun output (stdout and stderr) is redirected to ${output_dir}/log.txt.
# Optional environment overrides: WORLD_SIZE, RANK, MASTER_ADDR, MASTER_PORT.

set -euo pipefail

workspace=$(pwd)

# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0,1"
# Number of comma-separated entries in CUDA_VISIBLE_DEVICES; used as --nproc_per_node.
gpu_num=$(awk -F "," '{print NF}' <<< "${CUDA_VISIBLE_DEVICES}")

# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/SenseVoiceSmall"

## option 2, download model by git (uncomment if the automatic download fails)
# local_path_root=${workspace}/modelscope_models
# mkdir -p ${local_path_root}/${model_name_or_model_dir}
# git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
# model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}


# Training / validation jsonl files, resolved relative to the directory the script is run from.
train_data="${workspace}/data/train-ppc.jsonl"
val_data="${workspace}/data/val-ppc.jsonl"

# exp output dir (training log, checkpoints and error files all go here)
output_dir="./outputs"
log_file="${output_dir}/log.txt"

# DeepSpeed config; only referenced by the commented-out option in train_args below
# (use_deepspeed is currently false).
deepspeed_config="${workspace}/../../ds_stage1.json"

# Create the output directory up front so the log and error files have somewhere to go.
mkdir -p "${output_dir}"
echo "log_file: ${log_file}"

# Distributed launch arguments (multi-GPU). Kept in an array so every value stays
# a single word regardless of its content.
DISTRIBUTED_ARGS=(
  --nnodes "${WORLD_SIZE:-1}"
  --nproc_per_node "${gpu_num}"
  --node_rank "${RANK:-0}"
  --master_addr "${MASTER_ADDR:-127.0.0.1}"
  --master_port "${MASTER_PORT:-26669}"
)
echo "${DISTRIBUTED_ARGS[*]}"

# funasr trainer path (machine-specific: adjust to where funasr is installed)
train_tool="/root/miniconda3/envs/funasr/lib/python3.10/site-packages/funasr/bin/train_ds.py"
if [[ ! -f "${train_tool}" ]]; then
  echo "train_ds.py not found: ${train_tool}" >&2
  exit 1
fi

# Error-file location handed to torchrun / torchelastic for detailed failure traces.
# NOTE(review): the original note states that %r is replaced by the process rank so
# each GPU process gets its own JSON file; confirm this against the torchrun docs.
export TORCHELASTIC_ERROR_FILE="${output_dir}/elastic_error_%r.json"

# Training overrides passed to train_ds.py.
# An array is used instead of backslash continuations: in the continuation form, a
# comment line placed between arguments ends the command, so every argument after it
# (optimizer lr, output_dir and the log redirection) was silently dropped from torchrun.
train_args=(
  ++model="${model_name_or_model_dir}"
  ++train_data_set_list="${train_data}"
  ++valid_data_set_list="${val_data}"
  ++dataset_conf.data_split_num=1
  ++dataset_conf.batch_sampler="BatchSampler"
  ++dataset_conf.batch_size=10000
  ++dataset_conf.sort_size=1024
  ++dataset_conf.batch_type="token"
  ++dataset_conf.num_workers=4
  ++train_conf.max_epoch=50
  ++train_conf.log_interval=1
  ++train_conf.resume=true
  ++train_conf.validate_interval=2000
  ++train_conf.save_checkpoint_interval=2000
  ++train_conf.keep_nbest_models=10
  ++train_conf.avg_nbest_model=5
  ++train_conf.use_deepspeed=false
  # DeepSpeed is disabled above; to enable it, set use_deepspeed=true and uncomment:
  # ++train_conf.deepspeed_config="${deepspeed_config}"
  ++optim_conf.lr=0.0002
  ++output_dir="${output_dir}"
)

# Launch training; stdout and stderr are both captured in the log file.
torchrun "${DISTRIBUTED_ARGS[@]}" "${train_tool}" "${train_args[@]}" &> "${log_file}"

备注:如何找到本机安装的train_ds.py的具体路径。可以用下面的脚本查看FunASR的安装位置(Location),train_ds.py位于该目录下的 funasr/bin/train_ds.py:

#!/bin/bash
# Reports the active python3 / pip3, whether the FunASR package is installed,
# where it is installed, and the resolved path of funasr/bin/train_ds.py.
# Exits with status 1 only when pip3 is not available.

echo "========================================"
echo "  检查 FunASR 安装状态与位置"
echo "========================================"

# 1. Show the active Python first, so the wrong environment is not inspected by mistake.
printf '\n%s\n' "[1] 当前 Python 环境信息:"
command -v python3
python3 --version

# 2. Make sure pip3 is available before querying it.
if ! command -v pip3 &> /dev/null; then
  printf '\n%s\n' "错误:未找到 pip3,请先安装 pip 或确保 pip3 在环境变量中!" >&2
  exit 1
fi
printf '\n%s\n' "[2] 当前 pip 环境信息:"
command -v pip3
pip3 --version

# 3. Check whether FunASR is installed.
#    `pip3 show funasr` matches the exact package name; grepping `pip3 list` would
#    also match other packages whose names merely contain "funasr".
printf '\n%s\n' "[3] 检查 FunASR 是否安装:"
if funasr_info=$(pip3 show funasr 2> /dev/null); then
  echo "✅ FunASR 已安装!"

  # 4. Package details (name, version, install location); the pattern is anchored
  #    so only those header fields are printed.
  printf '\n%s\n' "[4] FunASR 详细安装信息:"
  grep -E "^(Name|Version|Location|Editable project location):" <<< "${funasr_info}"

  # 5. Location of the funasr command-line entry point, if one is on PATH.
  #    `command -v` prints the path when found and prints nothing otherwise.
  printf '\n%s\n' "[5] FunASR 命令行工具(funasr)路径:"
  if ! command -v funasr; then
    echo "⚠️  未找到 funasr 命令行工具(可能仅安装了库,未安装 CLI;或环境变量未配置)"
  fi

  # 6. Resolve train_ds.py inside the installed package. find_spec locates the
  #    package without importing it; any failure leaves the path empty.
  printf '\n%s\n' "[6] train_ds.py 路径:"
  train_ds_path=$(python3 -c 'import importlib.util as u, os; s = u.find_spec("funasr"); print(os.path.join(os.path.dirname(s.origin), "bin", "train_ds.py"))' 2> /dev/null) || train_ds_path=""
  if [[ -n "${train_ds_path}" && -f "${train_ds_path}" ]]; then
    echo "${train_ds_path}"
  else
    echo "⚠️  未能定位 train_ds.py(请确认 python3 与 pip3 属于同一环境)"
  fi
else
  echo "❌ FunASR 未安装!"
  echo "   建议安装命令:"
  echo "   - 从 PyPI 安装:pip3 install -U funasr"
  echo "   - 从源码安装:git clone https://github.com/alibaba/FunASR.git && cd FunASR && pip3 install -e ./"
fi

printf '\n%s\n' "========================================"

 

Logo

更多推荐