Facing difficulty while fine-tuning a speech recognition model on a local PC

I have successfully fine-tuned a voice recognition model on Google Colab, but when I use the same code to fine-tune the model on my local PC, it gives me the following error message.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Input In [85], in <cell line: 1>()
----> 1 trainer.train()

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1316, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1314         tr_loss_step = self.training_step(model, inputs)
   1315 else:
-> 1316     tr_loss_step = self.training_step(model, inputs)
   1318 if (
   1319     args.logging_nan_inf_filter
   1320     and not is_torch_tpu_available()
   1321     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1322 ):
   1323     # if loss is nan or inf simply add the average of previous logged losses
   1324     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1849, in Trainer.training_step(self, model, inputs)
   1847         loss = self.compute_loss(model, inputs)
   1848 else:
-> 1849     loss = self.compute_loss(model, inputs)
   1851 if self.args.n_gpu > 1:
   1852     loss = loss.mean()  # mean() to average on multi-gpu parallel training

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1881, in Trainer.compute_loss(self, model, inputs, return_outputs)
   1879 else:
   1880     labels = None
-> 1881 outputs = model(**inputs)
   1882 # Save past state if it exists
   1883 # TODO: this needs to be fixed and made cleaner later.
   1884 if self.args.past_index >= 0:

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1494, in Wav2Vec2ForCTC.forward(self, input_values, attention_mask, output_attentions, output_hidden_states, return_dict, labels)
   1449 r"""
   1450 labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_length)`, `optional`):
   1451     Labels for connectionist temporal classification. Note that ``target_length`` has to be smaller or equal to
   (...)
   1489     >>> loss = model(input_values, labels=labels).loss
   1490 """
   1492 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1494 outputs = self.wav2vec2(
   1495     input_values,
   1496     attention_mask=attention_mask,
   1497     output_attentions=output_attentions,
   1498     output_hidden_states=output_hidden_states,
   1499     return_dict=return_dict,
   1500 )
   1502 hidden_states = outputs[0]
   1503 hidden_states = self.dropout(hidden_states)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1064, in Wav2Vec2Model.forward(self, input_values, attention_mask, mask_time_indices, output_attentions, output_hidden_states, return_dict)
   1059 output_hidden_states = (
   1060     output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
   1061 )
   1062 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1064 extract_features = self.feature_extractor(input_values)
   1065 extract_features = extract_features.transpose(1, 2)
   1067 if attention_mask is not None:
   1068     # compute reduced attention_mask corresponding to feature vectors

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:337, in Wav2Vec2FeatureExtractor.forward(self, input_values)
    335 hidden_states = input_values[:, None]
    336 for conv_layer in self.conv_layers:
--> 337     hidden_states = conv_layer(hidden_states)
    339 return hidden_states

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:258, in Wav2Vec2GroupNormConvLayer.forward(self, hidden_states)
    257 def forward(self, hidden_states):
--> 258     hidden_states = self.conv(hidden_states)
    259     hidden_states = self.layer_norm(hidden_states)
    260     hidden_states = self.activation(hidden_states)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:302, in Conv1d.forward(self, input)
    301 def forward(self, input: Tensor) -> Tensor:
--> 302     return self._conv_forward(input, self.weight, self.bias)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:298, in Conv1d._conv_forward(self, input, weight, bias)
    294 if self.padding_mode != 'zeros':
    295     return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    296                     weight, bias, self.stride,
    297                     _single(0), self.dilation, self.groups)
--> 298 return F.conv1d(input, weight, bias, self.stride,
    299                 self.padding, self.dilation, self.groups)

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

Can you tell me where the problem is?

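Looks to me like your model’s weights are on the GPU but the input is not: the error reports the input as torch.FloatTensor (a CPU tensor) and the weight as torch.cuda.FloatTensor. What is your training code?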

Here is my training notebook. Could you please take a look and suggest a solution?

https://github.com/iftekherhossain/Bangla-Voice-Recognition/blob/master/Bangla_voice_train.ipynb

I’m not experienced with audio, so I am afraid I cannot help you here.