sensevoice_small targeted fine-tuning for proper nouns: inference and evaluation
Original inference scripts:
Local:
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import sys
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # make only GPU 0 visible
os.environ["WORLD_SIZE"] = "1"

import torch
import argparse
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from functools import partial
from tqdm import tqdm


def init_model():
    """Initialize the model on a single GPU."""
    global model
    print(f"Using GPU: {os.environ['CUDA_VISIBLE_DEVICES']}")
    # Load the fine-tuned checkpoint
    model_dir = "/data/。。。/1_code_model/funasr/FunASR/examples/industrial_data_pretraining/sense_voice/outputs_finetune2"
    model = AutoModel(
        model=model_dir,
        trust_remote_code=True,
        vad_model="fsmn-vad",
        vad_kwargs={"max_single_segment_time": 30000},
        device="cuda:0",  # force GPU 0
        disable_update=True,
    )


def process_audio(line, output_file):
    """Process one input line and append the result to the output file."""
    audio_path = line.strip()  # fallback so the except block never hits an undefined name
    try:
        id_, audio_path = line.strip().split(' ', 1)
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        # Run speech recognition on the audio file
        res = model.generate(
            input=audio_path,
            cache={},
            language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
            use_itn=True,
            batch_size_s=80,
            merge_vad=True,
            merge_length_s=15,
            ban_emo_unk=False,
        )
        text = rich_transcription_postprocess(res[0]["text"])
        # Append the result to the output file
        with open(output_file, 'a', encoding='utf-8') as f_out:
            f_out.write(f"{id_} {text}\n")
        print(f"Processed audio: {audio_path} | Result: {text}")
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")


def main(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Single-process, sequential inference
    init_model()
    for line in tqdm(lines, desc="Processing audios"):
        process_audio(line, output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run voice recognition using single GPU.")
    parser.add_argument('input_file', type=str, help='Path to input file containing audio paths and IDs.')
    parser.add_argument('output_file', type=str, help='Path to output file for recognized texts.')
    args = parser.parse_args()
    # Truncate the output file before writing
    open(args.output_file, 'w').close()
    main(args.input_file, args.output_file)
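How to run (a minimal usage sketch; the file names wav_list.txt and hyp_local.txt and the script name infer_sensevoice_local.py are placeholders I introduce here, not part of the original post). Each line of the input file carries an utterance ID and an audio path separated by the first space, matching the split(' ', 1) in process_audio; each result is appended to the output file as "<id> <text>".

# wav_list.txt (assumed name): one "<id> <audio_path>" pair per line
utt_0001 /data/audio/utt_0001.wav
utt_0002 /data/audio/utt_0002.wav

python infer_sensevoice_local.py wav_list.txt hyp_local.txt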
Online (identical to the local script except that model_dir points to the stock iic/SenseVoiceSmall checkpoint instead of the fine-tuned output directory, and disable_update is not set):
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import sys
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # make only GPU 0 visible
os.environ["WORLD_SIZE"] = "1"

import torch
import argparse
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from functools import partial
from tqdm import tqdm


def init_model():
    """Initialize the model on a single GPU."""
    global model
    print(f"Using GPU: {os.environ['CUDA_VISIBLE_DEVICES']}")
    # Load the stock SenseVoiceSmall checkpoint
    model_dir = "iic/SenseVoiceSmall"
    model = AutoModel(
        model=model_dir,
        trust_remote_code=True,
        vad_model="fsmn-vad",
        vad_kwargs={"max_single_segment_time": 30000},
        device="cuda:0",  # force GPU 0
    )


def process_audio(line, output_file):
    """Process one input line and append the result to the output file."""
    audio_path = line.strip()  # fallback so the except block never hits an undefined name
    try:
        id_, audio_path = line.strip().split(' ', 1)
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        # Run speech recognition on the audio file
        res = model.generate(
            input=audio_path,
            cache={},
            language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
            use_itn=True,
            batch_size_s=80,
            merge_vad=True,
            merge_length_s=15,
            ban_emo_unk=False,
        )
        text = rich_transcription_postprocess(res[0]["text"])
        # Append the result to the output file
        with open(output_file, 'a', encoding='utf-8') as f_out:
            f_out.write(f"{id_} {text}\n")
        print(f"Processed audio: {audio_path} | Result: {text}")
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")


def main(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Single-process, sequential inference
    init_model()
    for line in tqdm(lines, desc="Processing audios"):
        process_audio(line, output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run voice recognition using single GPU.")
    parser.add_argument('input_file', type=str, help='Path to input file containing audio paths and IDs.')
    parser.add_argument('output_file', type=str, help='Path to output file for recognized texts.')
    args = parser.parse_args()
    # Truncate the output file before writing
    open(args.output_file, 'w').close()
    main(args.input_file, args.output_file)
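The section title also mentions evaluation, but only the inference scripts are shown above. Below is a minimal scoring sketch of my own (an assumption, not the original author's code): it aligns the hypothesis file written by the scripts against a reference file in the same "<id> <text>" format and reports character error rate (CER) via a plain Levenshtein distance. The file names ref.txt and hyp_local.txt are placeholders.

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Evaluation sketch (not from the original post): compute CER between a reference
# file and a hypothesis file, both formatted as "<id> <text>" per line.
import sys


def load(path):
    """Read "<id> <text>" lines into a dict, dropping spaces inside the text."""
    d = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ', 1)
            if len(parts) == 2:
                d[parts[0]] = parts[1].replace(' ', '')
    return d


def edit_distance(a, b):
    """Classic Levenshtein dynamic programming over characters."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]


if __name__ == "__main__":
    # Usage: python eval_cer.py ref.txt hyp_local.txt
    ref, hyp = load(sys.argv[1]), load(sys.argv[2])
    total_err, total_len = 0, 0
    for id_, r in ref.items():
        h = hyp.get(id_, '')          # missing hypotheses count as full deletions
        total_err += edit_distance(r, h)
        total_len += len(r)
    print(f"CER: {total_err / max(total_len, 1):.4f} ({total_err} edits / {total_len} ref chars)")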