Commit 837b945

removed fp16 / mixed precision, minor fixes
AznamirWoW committed Jan 7, 2025
1 parent a32d84e commit 837b945
Showing 8 changed files with 58 additions and 135 deletions.
core.py (2 changes: 1 addition & 1 deletion)
@@ -519,7 +519,7 @@ def run_train_script(

if custom_pretrained == False:
pg, pd = pretrained_selector(
str(vocoder), True, int(sample_rate)
str(vocoder), int(sample_rate)
)
else:
if g_pretrained_path is None or d_pretrained_path is None:
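
After this hunk, core.py calls pretrained_selector with only the vocoder name and the sample rate; the boolean that used to be hard-coded as the second argument (True) is gone. A minimal sketch of a selector with that two-argument shape is below; the checkpoint directory layout and file names here are assumptions for illustration only, not the repository's real path construction:

    def pretrained_selector(vocoder: str, sample_rate: int) -> tuple[str, str]:
        # Hypothetical layout, used only to illustrate the new two-argument call shape.
        base = f"rvc/models/pretraineds/{vocoder}"
        pg = f"{base}/f0G{sample_rate}.pth"  # generator pretrain
        pd = f"{base}/f0D{sample_rate}.pth"  # discriminator pretrain
        return pg, pd

    pg, pd = pretrained_selector("HiFi-GAN", 40000)  # example values
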
rvc/configs/config.py (17 changes: 1 addition & 16 deletions)
@@ -25,7 +25,6 @@ def get_instance(*args, **kwargs):
class Config:
def __init__(self):
self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.is_half = self.device != "cpu"
self.gpu_name = (
torch.cuda.get_device_name(int(self.device.split(":")[-1]))
if self.device.startswith("cuda")
@@ -82,13 +81,9 @@ def device_config(self):
self.set_cuda_config()
else:
self.device = "cpu"
self.is_half = False
self.set_precision("fp32")

# Configuration for 6GB GPU memory
x_pad, x_query, x_center, x_max = (
(3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
)
x_pad, x_query, x_center, x_max = (1, 6, 38, 41)
if self.gpu_mem is not None and self.gpu_mem <= 4:
# Configuration for 5GB GPU memory
x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
@@ -98,19 +93,10 @@ def set_cuda_config(self):
def set_cuda_config(self):
i_device = int(self.device.split(":")[-1])
self.gpu_name = torch.cuda.get_device_name(i_device)
low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
if (
any(gpu in self.gpu_name for gpu in low_end_gpus)
and "V100" not in self.gpu_name.upper()
):
self.is_half = False
self.set_precision("fp32")

self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
1024**3
)


def max_vram_gpu(gpu):
if torch.cuda.is_available():
gpu_properties = torch.cuda.get_device_properties(gpu)
@@ -119,7 +105,6 @@ def max_vram_gpu(gpu):
else:
return "8"


def get_gpu_info():
ngpu = torch.cuda.device_count()
gpu_infos = []
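
Taken together, the config.py hunks drop the is_half flag and the low-end-GPU special case: every device now runs float32, and the pipeline buffer sizes default to the former fp32 tuple. A condensed, self-contained sketch of the resulting sizing logic (the real code lives in Config.device_config and Config.set_cuda_config):

    import torch

    def device_config_sketch() -> tuple[int, int, int, int]:
        # Float32-only sizing as in this commit; no is_half branch remains.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        gpu_mem = None
        if device.startswith("cuda"):
            i_device = int(device.split(":")[-1])
            gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (1024**3)

        # Configuration for 6GB GPU memory (previously 3, 10, 60, 65 when is_half was set)
        x_pad, x_query, x_center, x_max = (1, 6, 38, 41)
        if gpu_mem is not None and gpu_mem <= 4:
            # Configuration for 5GB GPU memory
            x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
        return x_pad, x_query, x_center, x_max
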
rvc/infer/infer.py (12 changes: 3 additions & 9 deletions)
@@ -70,12 +70,7 @@ def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
embedder_model_custom (str): Path to the custom HuBERT model.
"""
self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
self.hubert_model.to(self.config.device)
self.hubert_model = (
self.hubert_model.half()
if self.config.is_half
else self.hubert_model.float()
)
self.hubert_model = self.hubert_model.to(self.config.device).float()
self.hubert_model.eval()

@staticmethod
@@ -482,13 +477,12 @@ def setup_network(self):
*self.cpt["config"],
use_f0=self.use_f0,
text_enc_hidden_dim=self.text_enc_hidden_dim,
is_half=False,
vocoder=self.vocoder,
)
del self.net_g.enc_q
self.net_g.load_state_dict(self.cpt["weight"], strict=False)
self.net_g.eval().to(self.config.device)
self.net_g = self.net_g.float()
self.net_g = self.net_g.to(self.config.device).float()
self.net_g.eval()

def setup_vc_instance(self):
"""
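
Both infer.py hunks converge on the same load pattern: move the module to the configured device, cast it to float32 unconditionally, and switch to eval mode. A small sketch of that pattern as a helper; the diff applies it inline to self.hubert_model and self.net_g:

    import torch

    def to_eval_fp32(model: torch.nn.Module, device: str) -> torch.nn.Module:
        # Device placement, unconditional float32 cast, inference mode.
        model = model.to(device).float()
        model.eval()
        return model

In the spirit of the diff: self.hubert_model = to_eval_fp32(load_embedding(embedder_model, embedder_model_custom), self.config.device).
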
rvc/infer/pipeline.py (10 changes: 1 addition & 9 deletions)
@@ -133,7 +133,6 @@ def __init__(self, tgt_sr, config):
self.x_query = config.x_query
self.x_center = config.x_center
self.x_max = config.x_max
self.is_half = config.is_half
self.sample_rate = 16000
self.window = 160
self.t_pad = self.sample_rate * self.x_pad
@@ -208,7 +207,6 @@ def __init__(self, tgt_sr, config):
self.note_dict = self.autotune.note_dict
self.model_rmvpe = RMVPE0Predictor(
os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
is_half=self.is_half,
device=self.device,
)

@@ -440,11 +438,7 @@ def voice_conversion(
with torch.no_grad():
pitch_guidance = pitch != None and pitchf != None
# prepare source audio
feats = (
torch.from_numpy(audio0).half()
if self.is_half
else torch.from_numpy(audio0).float()
)
feats = torch.from_numpy(audio0).float()
feats = feats.mean(-1) if feats.dim() == 2 else feats
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1).to(self.device)
@@ -498,12 +492,10 @@ def voice_conversion(

def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
npy = feats[0].cpu().numpy()
npy = npy.astype("float32") if self.is_half else npy
score, ix = index.search(npy, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
npy = npy.astype("float16") if self.is_half else npy
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
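
With is_half gone from the pipeline, source features stay float32 from torch.from_numpy(audio0).float() through the FAISS lookup, so the float32/float16 round-trips around index.search are removed. A standalone sketch of the retrieval-and-blend step performed by _retrieve_speaker_embeddings (NumPy in, NumPy out; the real method blends back into a torch tensor on the device):

    import numpy as np

    def blend_with_index(feats, index, big_npy, index_rate):
        # feats: (frames, dims) float32 features; index: a FAISS index built over big_npy.
        score, ix = index.search(feats, k=8)               # 8 nearest training frames
        weight = np.square(1 / score)                      # inverse-square distance weighting
        weight /= weight.sum(axis=1, keepdims=True)
        retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
        return index_rate * retrieved + (1 - index_rate) * feats
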
rvc/lib/predictors/F0Extractor.py (1 change: 0 additions & 1 deletion)
@@ -79,7 +79,6 @@ def extract_f0(self):
elif method == "rmvpe":
model_rmvpe = RMVPE0Predictor(
os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
is_half=config.is_half,
device=config.device,
# hop_length=80
)
rvc/lib/predictors/RMVPE.py (16 changes: 2 additions & 14 deletions)
@@ -344,7 +344,6 @@ class MelSpectrogram(torch.nn.Module):
Extracts Mel-spectrogram features from audio.
Args:
is_half (bool): Whether to use half-precision floating-point numbers.
n_mel_channels (int): Number of Mel-frequency bands.
sample_rate (int): Sampling rate of the audio.
win_length (int): Length of the window function in samples.
@@ -357,7 +356,6 @@ class MelSpectrogram(torch.nn.Module):

def __init__(
self,
is_half,
n_mel_channels,
sample_rate,
win_length,
@@ -386,7 +384,6 @@ def __init__(
self.sample_rate = sample_rate
self.n_mel_channels = n_mel_channels
self.clamp = clamp
self.is_half = is_half

def forward(self, audio, keyshift=0, speed=1, center=True):
factor = 2 ** (keyshift / 12)
@@ -416,8 +413,6 @@ def forward(self, audio, keyshift=0, speed=1, center=True):
magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
mel_output = torch.matmul(self.mel_basis, magnitude)
if self.is_half:
mel_output = mel_output.half()
log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
return log_mel_spec

@@ -428,24 +423,19 @@ class RMVPE0Predictor:
Args:
model_path (str): Path to the RMVPE0 model file.
is_half (bool): Whether to use half-precision floating-point numbers.
device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
"""

def __init__(self, model_path, is_half, device=None):
def __init__(self, model_path, device=None):
self.resample_kernel = {}
model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location="cpu")
model.load_state_dict(ckpt)
model.eval()
if is_half:
model = model.half()
self.model = model
self.resample_kernel = {}
self.is_half = is_half
self.device = device
self.mel_extractor = MelSpectrogram(
is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
self.mel_extractor = MelSpectrogram(N_MELS, 16000, 1024, 160, None, 30, 8000
).to(device)
self.model = self.model.to(device)
cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
@@ -491,8 +481,6 @@ def infer_from_audio(self, audio, thred=0.03):
mel = self.mel_extractor(audio, center=True)
hidden = self.mel2hidden(mel)
hidden = hidden.squeeze(0).cpu().numpy()
if self.is_half == True:
hidden = hidden.astype("float32")
f0 = self.decode(hidden, thred=thred)
return f0

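
After the RMVPE.py hunks, the predictor is built as RMVPE0Predictor(model_path, device=device) and the mel extractor as MelSpectrogram(N_MELS, 16000, 1024, 160, None, 30, 8000); the mel output is no longer cast to half before the log/clamp, and hidden states come back as float32 without the extra astype. A minimal sketch of the float32 tail of MelSpectrogram.forward (the clamp default is an assumption, matching the usual 1e-5):

    import torch

    def log_mel(mel_basis: torch.Tensor, magnitude: torch.Tensor, clamp: float = 1e-5) -> torch.Tensor:
        # The matmul result stays float32 now; no .half() cast before the log.
        mel_output = torch.matmul(mel_basis, magnitude)
        return torch.log(torch.clamp(mel_output, min=clamp))
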
rvc/train/extract/extract.py (7 changes: 3 additions & 4 deletions)
@@ -105,7 +105,6 @@ def process_files(self, files, f0_method, hop_length, device, threads):
if f0_method == "rmvpe":
self.model_rmvpe = RMVPE0Predictor(
os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
is_half=False,
device=device,
)

@@ -146,15 +145,15 @@ def run_pitch_extraction(files, devices, f0_method, hop_length, threads):
def process_file_embedding(
files, embedder_model, embedder_model_custom, device_num, device, n_threads
):
dtype = torch.float16 if (config.is_half and "cuda" in device) else torch.float32
model = load_embedding(embedder_model, embedder_model_custom).to(dtype).to(device)
model = load_embedding(embedder_model, embedder_model_custom).to(device).float()
model.eval()
n_threads = max(1, n_threads)

def worker(file_info):
wav_file_path, _, _, out_file_path = file_info
if os.path.exists(out_file_path):
return
feats = torch.from_numpy(load_audio(wav_file_path, 16000)).to(dtype).to(device)
feats = torch.from_numpy(load_audio(wav_file_path, 16000)).to(device).float()
feats = feats.view(1, -1)
with torch.no_grad():
result = model(feats)["last_hidden_state"]
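
The embedding extraction worker likewise drops the dtype switch: the embedder and the 16 kHz audio are both cast to float32 on the target device. A compact sketch of that step (audio_16k stands in for the array returned by the project's load_audio helper):

    import numpy as np
    import torch

    def embed_audio(model: torch.nn.Module, audio_16k: np.ndarray, device: str) -> torch.Tensor:
        # Mirrors the float32 path in process_file_embedding's worker.
        feats = torch.from_numpy(audio_16k).to(device).float().view(1, -1)
        with torch.no_grad():
            return model(feats)["last_hidden_state"]
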