另一个微调文章推荐:https://datacanvas.csdn.net/68e8c2634b11580edfa2b690.html?spm=1001.2101.3001.6650.6&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7Eactivity-6-149087508-blog-147157745.235%5Ev43%5Epc_blog_bottom_relevance_base9&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7Eactivity-6-149087508-blog-147157745.235%5Ev43%5Epc_blog_bottom_relevance_base9&utm_relevant_index=13

原始推理脚本:

本地:

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)

import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # 强制仅可见 GPU 0
os.environ["WORLD_SIZE"] = "1"
import torch
import argparse
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from functools import partial
from tqdm import tqdm

def init_model():
    """Initialise the module-global ``model`` with the fine-tuned checkpoint on GPU 0.

    Side effects:
        Binds the global name ``model`` to a FunASR ``AutoModel`` instance and
        prints the GPU selection to stdout.
    """
    global model
    print(f"Using GPU: {os.environ['CUDA_VISIBLE_DEVICES']}")

    # Locally fine-tuned SenseVoice checkpoint directory.
    checkpoint = "/data/。。。/1_code_model/funasr/FunASR/examples/industrial_data_pretraining/sense_voice/outputs_finetune2"
    # Model construction options: VAD front-end with 30 s max segment,
    # pinned to GPU 0 (CUDA_VISIBLE_DEVICES is forced above), no update check.
    options = dict(
        trust_remote_code=True,
        vad_model="fsmn-vad",
        vad_kwargs={"max_single_segment_time": 30000},
        device="cuda:0",
        disable_update=True,
    )
    model = AutoModel(model=checkpoint, **options)

def process_audio(line, output_file):
    """Transcribe one manifest line ("<id> <audio_path>") and append the result.

    Args:
        line: One input line; the id and audio path are separated by the
            first space character.
        output_file: Path of the results file; on success one
            "<id> <text>" line is appended per call.

    Returns:
        None. All errors are caught and reported to stdout so that one bad
        line does not abort the whole batch.
    """
    # Pre-bind so the except-handler can always reference it — in the
    # original code a line without a space made split() raise before
    # audio_path was assigned, and the handler itself crashed with NameError.
    audio_path = "<unparsed>"
    try:
        id_, audio_path = line.strip().split(' ', 1)
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # Run recognition; VAD splits long audio and segments are merged back.
        res = model.generate(
            input=audio_path,
            cache={},
            language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
            use_itn=True,
            batch_size_s=80,
            merge_vad=True,
            merge_length_s=15,
            ban_emo_unk=False,
        )
        text = rich_transcription_postprocess(res[0]["text"])

        # Append immediately so partial results survive a crash mid-run.
        with open(output_file, 'a', encoding='utf-8') as f_out:
            f_out.write(f"{id_} {text}\n")

        print(f"Processed audio: {audio_path} | Result: {text}")
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

def main(input_file, output_file):
    """Recognise every audio listed in *input_file*, appending to *output_file*.

    Args:
        input_file: Manifest file, one "<id> <audio_path>" entry per line.
        output_file: Destination file for "<id> <text>" results.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        manifest = handle.readlines()

    # Single-process pipeline: load the model once, then transcribe line by line.
    init_model()
    for entry in tqdm(manifest, desc="Processing audios"):
        process_audio(entry, output_file)

if __name__ == "__main__":
    # CLI: positional input manifest and output file.
    parser = argparse.ArgumentParser(description="Run voice recognition using single GPU.")
    parser.add_argument('input_file', type=str, help='Path to input file containing audio paths and IDs.')
    parser.add_argument('output_file', type=str, help='Path to output file for recognized texts.')
    args = parser.parse_args()

    # Truncate the output file so this run starts from an empty result set.
    with open(args.output_file, 'w'):
        pass

    main(args.input_file, args.output_file)
    



线上:

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)

import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # 强制仅可见 GPU 0
os.environ["WORLD_SIZE"] = "1"
import torch
import argparse
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from functools import partial
from tqdm import tqdm

def init_model():
    """Initialise the module-global ``model`` with the hub checkpoint on GPU 0.

    Side effects:
        Binds the global name ``model`` to a FunASR ``AutoModel`` instance and
        prints the GPU selection to stdout.
    """
    global model
    print(f"Using GPU: {os.environ['CUDA_VISIBLE_DEVICES']}")

    # Public SenseVoice-Small checkpoint resolved from the model hub.
    checkpoint = "iic/SenseVoiceSmall"
    # Model construction options: VAD front-end with 30 s max segment,
    # pinned to GPU 0 (CUDA_VISIBLE_DEVICES is forced above).
    options = dict(
        trust_remote_code=True,
        vad_model="fsmn-vad",
        vad_kwargs={"max_single_segment_time": 30000},
        device="cuda:0",
    )
    model = AutoModel(model=checkpoint, **options)

def process_audio(line, output_file):
    """Transcribe one manifest line ("<id> <audio_path>") and append the result.

    Args:
        line: One input line; the id and audio path are separated by the
            first space character.
        output_file: Path of the results file; on success one
            "<id> <text>" line is appended per call.

    Returns:
        None. All errors are caught and reported to stdout so that one bad
        line does not abort the whole batch.
    """
    # Pre-bind so the except-handler can always reference it — in the
    # original code a line without a space made split() raise before
    # audio_path was assigned, and the handler itself crashed with NameError.
    audio_path = "<unparsed>"
    try:
        id_, audio_path = line.strip().split(' ', 1)
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # Run recognition; VAD splits long audio and segments are merged back.
        res = model.generate(
            input=audio_path,
            cache={},
            language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
            use_itn=True,
            batch_size_s=80,
            merge_vad=True,
            merge_length_s=15,
            ban_emo_unk=False,
        )
        text = rich_transcription_postprocess(res[0]["text"])

        # Append immediately so partial results survive a crash mid-run.
        with open(output_file, 'a', encoding='utf-8') as f_out:
            f_out.write(f"{id_} {text}\n")

        print(f"Processed audio: {audio_path} | Result: {text}")
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

def main(input_file, output_file):
    """Recognise every audio listed in *input_file*, appending to *output_file*.

    Args:
        input_file: Manifest file, one "<id> <audio_path>" entry per line.
        output_file: Destination file for "<id> <text>" results.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        manifest = handle.readlines()

    # Single-process pipeline: load the model once, then transcribe line by line.
    init_model()
    for entry in tqdm(manifest, desc="Processing audios"):
        process_audio(entry, output_file)

if __name__ == "__main__":
    # CLI: positional input manifest and output file.
    parser = argparse.ArgumentParser(description="Run voice recognition using single GPU.")
    parser.add_argument('input_file', type=str, help='Path to input file containing audio paths and IDs.')
    parser.add_argument('output_file', type=str, help='Path to output file for recognized texts.')
    args = parser.parse_args()

    # Truncate the output file so this run starts from an empty result set.
    with open(args.output_file, 'w'):
        pass

    main(args.input_file, args.output_file)
    


Logo

更多推荐