|
|
| """
|
| Vocoder伪影修复 - 针对呼吸音电音和长音撕裂
|
| 基于RVC社区反馈和研究文献
|
| """
|
| import numpy as np
|
| from scipy import signal
|
| from typing import Optional
|
|
|
|
|
def fix_phase_discontinuity(audio: np.ndarray, sr: int, chunk_boundaries: Optional[list] = None) -> np.ndarray:
    """Repair tearing caused by phase discontinuities.

    Reference: "Prosody-Guided Harmonic Attention for Phase-Coherent Neural
    Vocoding" (arXiv:2601.14472). Vocoders can produce phase discontinuities
    on sustained notes, which are audible as tearing.

    Args:
        audio: Audio samples (1-D array).
        sr: Sample rate in Hz.
        chunk_boundaries: Chunk boundary positions (sample indices).
            Currently unused; reserved for boundary-aware correction.

    Returns:
        Repaired audio. The input array is returned unchanged when no
        discontinuity is found; otherwise a float32 re-synthesis is returned.
    """
    # Guard: the Hilbert transform / percentile below need >= 2 samples.
    if audio.size < 2:
        return audio

    # Instantaneous phase and amplitude via the analytic signal.
    analytic_signal = signal.hilbert(audio)
    instantaneous_phase = np.unwrap(np.angle(analytic_signal))
    amplitude = np.abs(analytic_signal)

    # A phase increment well above the 99th percentile marks a discontinuity.
    phase_diff = np.diff(instantaneous_phase)
    phase_diff_threshold = np.percentile(np.abs(phase_diff), 99) * 2.5

    discontinuities = np.where(np.abs(phase_diff) > phase_diff_threshold)[0]

    if len(discontinuities) == 0:
        return audio

    phase_corrected = instantaneous_phase.copy()

    for disc_idx in discontinuities:
        phase_jump = phase_diff[disc_idx]

        # Spread the jump over up to 20 ms so the correction is inaudible.
        correction_length = min(int(0.02 * sr), len(phase_corrected) - disc_idx - 1)
        if correction_length > 0:
            correction_curve = np.linspace(phase_jump, 0, correction_length)
            phase_corrected[disc_idx + 1:disc_idx + 1 + correction_length] -= correction_curve

    # Re-synthesize from the original envelope and the smoothed phase.
    corrected_signal = amplitude * np.exp(1j * phase_corrected)
    return np.real(corrected_signal).astype(np.float32)
|
|
|
|
|
def reduce_breath_electric_noise(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Reduce electronic ("electric") noise inside breath segments.

    Reference: GitHub issue #65 "Artefacting when speech has breath".
    Problem: vocoders tend to emit electronic noise in unvoiced (F0 == 0)
    regions such as breaths.

    Args:
        audio: Audio samples (1-D array).
        sr: Sample rate in Hz.
        f0: Optional F0 sequence; voiced frames (F0 > 0) are excluded from
            the breath/noise mask.

    Returns:
        Processed audio: high-pass filtered, with noisy breath frames
        spectrally attenuated and blended back with the original.
    """
    # High-pass at 80 Hz to remove rumble / DC before analysis.
    nyquist = sr / 2
    cutoff = 80 / nyquist
    sos = signal.butter(4, cutoff, btype='highpass', output='sos')
    audio = signal.sosfilt(sos, audio)

    frame_length = int(0.02 * sr)   # 20 ms analysis frames
    hop_length = int(0.01 * sr)     # 10 ms hop (50% overlap)

    # Guard: shorter than one analysis frame -> nothing to classify.
    if len(audio) < frame_length:
        return audio

    n_frames = 1 + (len(audio) - frame_length) // hop_length

    # Per-frame features: energy, spectral flatness, high-frequency ratio.
    energy = np.zeros(n_frames)
    spectral_flatness = np.zeros(n_frames)
    high_freq_ratio = np.zeros(n_frames)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break

        frame = audio[start:end]

        energy[i] = np.sum(frame ** 2)

        fft = np.abs(np.fft.rfft(frame))
        if np.sum(fft) > 1e-10:
            # Spectral flatness = geometric / arithmetic mean of magnitudes:
            # near 1 for wideband noise, near 0 for tonal content.
            geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
            arithmetic_mean = np.mean(fft)
            spectral_flatness[i] = geometric_mean / (arithmetic_mean + 1e-10)

        # Share of energy above 4 kHz (electric artifacts sit high).
        freqs = np.fft.rfftfreq(len(frame), 1 / sr)
        high_freq_mask = freqs >= 4000
        high_freq_energy = np.sum(fft[high_freq_mask] ** 2)
        total_freq_energy = np.sum(fft ** 2)
        high_freq_ratio[i] = high_freq_energy / (total_freq_energy + 1e-10)

    energy_db = 10 * np.log10(energy + 1e-10)

    # The quietest 5% of frames are breath/noise candidates.
    candidate_threshold = np.percentile(energy_db, 5)

    is_candidate = energy_db < candidate_threshold
    is_wideband_noise = is_candidate & (spectral_flatness > 0.35)
    is_highfreq_noise = is_candidate & (high_freq_ratio > 0.15)

    is_noise = is_wideband_noise | is_highfreq_noise

    # Nothing worth fixing.
    noise_ratio = is_noise.sum() / len(is_noise)
    if noise_ratio < 0.01:
        return audio

    # Never treat voiced frames (F0 > 0) as breath noise.
    if f0 is not None and len(f0) > 0:
        f0_per_audio_frame = len(f0) / n_frames
        for i in range(n_frames):
            if not is_noise[i]:
                continue

            f0_idx = int(i * f0_per_audio_frame)
            if f0_idx < len(f0):
                if f0[f0_idx] > 0:
                    is_noise[i] = False

    is_breath = is_noise

    # More noise -> keep more spectral peaks per frame, but mix in more of
    # the cleaned signal overall.
    if noise_ratio < 0.05:
        spectral_threshold_percentile = 85
        magnitude_attenuation = 0.2
        mix_ratio = 0.5
    elif noise_ratio < 0.15:
        spectral_threshold_percentile = 90
        magnitude_attenuation = 0.1
        mix_ratio = 0.7
    else:
        spectral_threshold_percentile = 95
        magnitude_attenuation = 0.05
        mix_ratio = 0.85

    result = audio.copy()

    for i in range(n_frames):
        if is_breath[i]:
            start = i * hop_length
            end = start + frame_length
            if end > len(audio):
                break

            frame = audio[start:end]

            fft = np.fft.rfft(frame)
            magnitude = np.abs(fft)
            phase = np.angle(fft)
            freqs = np.fft.rfftfreq(len(frame), 1 / sr)

            high_freq_mask = freqs >= 4000
            high_freq_energy = np.sum(magnitude[high_freq_mask] ** 2)
            total_freq_energy = np.sum(magnitude ** 2)
            frame_high_ratio = high_freq_energy / (total_freq_energy + 1e-10)

            if frame_high_ratio > 0.15:
                # Strong high-frequency artifact: crush highs, soften mids.
                magnitude[high_freq_mask] *= 0.05

                mid_freq_mask = (freqs >= 1000) & (freqs < 4000)
                magnitude[mid_freq_mask] *= 0.3
            else:
                # Spectral gating: keep only the strongest bins.
                threshold = np.percentile(magnitude, spectral_threshold_percentile)
                magnitude = np.where(magnitude > threshold, magnitude, magnitude * magnitude_attenuation)

            fft_cleaned = magnitude * np.exp(1j * phase)
            frame_cleaned = np.fft.irfft(fft_cleaned, n=len(frame))

            # Short fades at frame edges avoid clicks.
            fade_length = min(hop_length // 2, len(frame) // 4)
            if fade_length > 0:
                fade_in = np.linspace(0, 1, fade_length)
                fade_out = np.linspace(1, 0, fade_length)

                frame_cleaned[:fade_length] *= fade_in
                frame_cleaned[-fade_length:] *= fade_out

            # Blend cleaned frame back with the filtered original.
            result[start:end] = frame * (1 - mix_ratio) + frame_cleaned * mix_ratio

    return result
|
|
|
|
|
def stabilize_sustained_notes(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Stabilize sustained notes to prevent tearing.

    Reference: "Mel Spectrogram Inversion with Stable Pitch" (Apple
    Research). Vocoders tend to drift during long sustained notes; here the
    amplitude envelope of each sustained region is smoothed.

    Args:
        audio: Audio samples (1-D array).
        sr: Sample rate in Hz.
        f0: F0 sequence used to detect sustained notes. When absent or
            empty the audio is returned unchanged.

    Returns:
        Stabilized audio.
    """
    if f0 is None or len(f0) == 0:
        return audio

    frame_length = int(0.02 * sr)   # 20 ms frames
    hop_length = int(0.01 * sr)     # 10 ms hop

    # Guard: shorter than one analysis frame -> nothing to stabilize.
    if len(audio) < frame_length:
        return audio

    n_audio_frames = 1 + (len(audio) - frame_length) // hop_length
    f0_per_audio_frame = len(f0) / n_audio_frames

    is_sustained = np.zeros(n_audio_frames, dtype=bool)

    # A frame is "sustained" when the surrounding +-window_size frames are
    # mostly voiced and their F0 is nearly flat.
    window_size = 20
    for i in range(window_size, n_audio_frames - window_size):
        f0_idx = int(i * f0_per_audio_frame)
        if f0_idx >= len(f0):
            break

        f0_window_start = max(0, f0_idx - window_size)
        f0_window_end = min(len(f0), f0_idx + window_size)
        f0_window = f0[f0_window_start:f0_window_end]

        f0_voiced = f0_window[f0_window > 0]

        if len(f0_voiced) > window_size * 0.8:
            f0_std = np.std(f0_voiced)
            f0_mean = np.mean(f0_voiced)

            # Coefficient of variation below 5% -> flat pitch -> sustained.
            if f0_std / (f0_mean + 1e-6) < 0.05:
                is_sustained[i] = True

    result = audio.copy()

    i = 0
    while i < n_audio_frames:
        if is_sustained[i]:
            # Collect the whole contiguous sustained run.
            start_frame = i
            while i < n_audio_frames and is_sustained[i]:
                i += 1
            end_frame = i

            start_sample = start_frame * hop_length
            end_sample = min(end_frame * hop_length + frame_length, len(audio))

            if end_sample - start_sample < frame_length:
                continue  # run too short; i already advanced past it

            sustained_segment = audio[start_sample:end_sample]

            # Amplitude envelope via the analytic signal.
            envelope = np.abs(signal.hilbert(sustained_segment))

            # Low-pass (50 Hz) the envelope to extract its slow trend.
            b, a = signal.butter(2, 50 / (sr / 2), btype='low')
            smoothed_envelope = signal.filtfilt(b, a, envelope)

            # Only suppress fast envelope wobble; keep slow dynamics.
            envelope_variation = np.abs(envelope - smoothed_envelope)
            variation_threshold = np.percentile(envelope_variation, 75)

            blend_mask = np.clip(envelope_variation / (variation_threshold + 1e-6), 0, 1)

            target_envelope = smoothed_envelope * blend_mask + envelope * (1 - blend_mask)

            if np.max(envelope) > 1e-6:
                gain = target_envelope / (envelope + 1e-6)
                # Bound the gain to avoid pumping artifacts.
                gain = np.clip(gain, 0.5, 2.0)
                result[start_sample:end_sample] = sustained_segment * gain

        i += 1

    return result
|
|
|
|
|
def apply_vocoder_artifact_fix(
    audio: np.ndarray,
    sr: int,
    f0: Optional[np.ndarray] = None,
    chunk_boundaries: Optional[list] = None,
    fix_phase: bool = True,
    fix_breath: bool = True,
    fix_sustained: bool = True
) -> np.ndarray:
    """Apply the full vocoder-artifact repair pipeline.

    Args:
        audio: Audio samples.
        sr: Sample rate in Hz.
        f0: F0 sequence.
        chunk_boundaries: Chunk boundary positions.
        fix_phase: Whether to repair phase discontinuities.
        fix_breath: Whether to reduce electric noise in breaths.
        fix_sustained: Whether to stabilize sustained notes.

    Returns:
        Repaired audio.
    """
    processed = audio.copy()

    # Run each enabled stage in order; each stage consumes the output of
    # the previous one.
    stages = (
        (fix_phase, lambda x: fix_phase_discontinuity(x, sr, chunk_boundaries)),
        (fix_breath, lambda x: reduce_breath_electric_noise(x, sr, f0)),
        (fix_sustained, lambda x: stabilize_sustained_notes(x, sr, f0)),
    )
    for enabled, stage in stages:
        if enabled:
            processed = stage(processed)

    return processed
|
|
|