
    i3                         d dl Zd dlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlmZ  e	       rd dlZdZdZ e
j                   e      Z ed	
       G d de             ZdgZy)    N   )SequenceFeatureExtractor)BatchFeature)
TensorTypeis_librosa_availablelogging)requiresgh㈵>g      p>)torchlibrosa)backendsc                       e Zd ZdZddgZ	 	 	 	 	 	 	 d fd	ZddZ	 	 	 	 	 	 	 	 	 	 ddej                  e	e
   z  e	ej                     z  e	e	e
      z  ded	edz  d
eez  dz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  defdZ xZS )ParakeetFeatureExtractora  
    Constructs a Parakeet feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        n_fft (`int`, *optional*, defaults to 512):
            Size of the Fourier transform.
        win_length (`int`, *optional*, defaults to 400):
            The window length for the STFT computation.
        preemphasis (`float`, *optional*, defaults to 0.97):
            A preemphasis filter coefficient. 0.0 means no preemphasis filter.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
    input_featuresattention_maskc                 "   t        
|   d|||d| || _        || _        || _        || _        t        j                  j                  |||d|dz  d      }	t        j                  |	      j                  t        j                        | _        y )N)feature_sizesampling_ratepadding_value           slaney)srn_fftn_melsfminfmaxnorm )super__init__
hop_lengthr   
win_lengthpreemphasisr   filtersmelr
   
from_numpytofloat32mel_filters)selfr   r   r!   r   r"   r#   r   kwargsr)   	__class__s             t/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/parakeet/feature_extraction_parakeet.pyr    z!ParakeetFeatureExtractor.__init__A   s     	wl-_lwpvw$
$& oo))E,S}_`O`go * 
 !++K8;;EMMJ    c           	         t        j                  | j                  d|      }t        j                  || j                  | j
                  | j                  |dd      }t        j                  |      }t        j                  |j                  d      j                  d            }|j                  d      }| j                  j                  |      }||z  }t        j                  |t        z         }|j                  ddd	      }|S )
NF)periodicdeviceTconstant)r!   r"   windowreturn_complexpad_moder   r      )r
   hann_windowr"   stftr   r!   view_as_realsqrtpowsumr)   r'   logLOG_ZERO_GUARD_VALUEpermute)r*   waveformr1   r3   r9   
magnitudesr)   mel_specs           r-   _torch_extract_fbank_featuresz6ParakeetFeatureExtractor._torch_extract_fbank_featuresc   s    ""4??U6RzzJJ
 ''-
ZZ
q 1 5 5b 9:
^^A&
 &&))&1+99X(<<= ##Aq!,r.   N
raw_speech
truncationpad_to_multiple_ofreturn_tensorsreturn_attention_maskpadding
max_lengthr   do_normalizer1   return_token_timestampsreturnc                 	   |O|| j                   k7  rmt        d| j                  j                   d| j                    d| j                    d| d	      t        j                  d| j                  j                   d       t        |t        j                        rt        j                  |      }nUt        |t        t        f      r?t        |d	   t        j                        r"|D cg c]  }t        j                  |       }}t        |t        j                        xr t        |j                        d
kD  }|rVt        |j                        dkD  r>t        j                  d| j                  j                   d       |j!                  d      }t        |t        t        f      }|r^|D ]Y  }t        |j                        d
kD  st        j                  d| j                  j                   d       |j!                  d      }[ |s|r4|D cg c](  }|dddf   j#                  t        j$                        * }}n'|dddf   j#                  t        j$                        g}|D cg c]  }t        |       }}t'        ||d      }| j)                  |||||d      }|j*                  j-                  d      }| j.                  t        j0                  |j                  d
   |j2                        j5                  d	      |j6                  j5                  d
      k  }t        j8                  |dddd
f   |ddd
df   | j.                  |ddddf   z  z
  gd
      }|j;                  | d      }| j=                  ||
      }t        j>                  |j6                  | j@                  dz  dz  z   | j@                  z
  | jB                        }t        j0                  |j                  d
   |
      dddf   |dddf   k  }|j5                  d      }||z  }|jE                  d
      |j5                  d      z  }|j5                  d
      }||z
  dz  |z  jE                  d
      |d
z
  j5                  d      z  }t        jF                  |      j5                  d
      }||z
  |tH        z   z  }||z  }t'        ||d|      S c c}w c c}w c c}w )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r7   r   z2Only mono-channel audio is supported for input to z;. We will take the mean of the channels to convert to mono.r6   )r   audio_lengthspt)rJ   rK   rF   rG   rH   )r1   )dimr   )r   r   )datatensor_type)%r   
ValueErrorr,   __name__loggerwarning
isinstancenpndarrayr
   tensorlisttupleTensorlenshapemeanr'   r(   r   padr   squeezer#   aranger1   	unsqueezerQ   catmasked_fillrD   floor_divider   r!   r=   r;   EPSILON)r*   rE   rF   rG   rH   rI   rJ   rK   r   rL   r1   rM   r+   speechis_batched_torchis_batched_sequencerQ   batched_speechpadded_inputsr   timemaskfeatures_lengthsr   maskinput_features_maskedrc   variancestds                               r-   __call__z!ParakeetFeatureExtractor.__call__   s   H $ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  NNVW[WeWeWnWnVo p\ \ j"**-j1J
T5M2z*Q-QSQ[Q[7\=GH6%,,v.HJH%j%,,?]C
HXHXDY\]D]J$4$4 5 9NNDT^^E\E\D] ^L L $,J(dE]C$ -v||$q(NNLT^^MdMdLe fT T $[[_F- 2JTU&D/,,U]];UJU$QW-00?@J3=>V>>%Vc&de!!1 ! 
 '55==bA '||N$8$8$;NDYDYZdd++55a89H #YY2A2&q!"u(=@P@PSabcehfhehbhSi@i(ijpqN ,77	3GN;;NFS --''$**/A*==

JDOO
 n&:&:1&=fMdTUgVYijkmqjqYrr ''+ . 5$((Q(/2B2L2LR2PP~~a *T1a7$>CCCJN^abNbMmMmnpMqqjj",,Q/(4/C'MB$"0"0 '
 	
A I* V ?s   &S-S)S)P   i>     i   i  g
ףp=
?r   )cpu)
FNNNlongestNNNrz   N)rW   
__module____qualname____doc__model_input_namesr    rD   r[   r\   r^   floatboolintstrr   r   rw   __classcell__)r,   s   @r-   r   r   #   s4   4 *+;<  KD> !)-26-1'!%$($("/3[
JJe,tBJJ/??$tE{BSS[
 [
  $J	[

 j(4/[
  $d{[
 t[
 $J[
 Tz[
 Tk[
 d
[
 "&[
 
[
r.   r   )numpyr[   r
   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   utils.import_utilsr	   r   rk   r?   
get_loggerrW   rX   r   __all__r   r.   r-   <module>r      sx      I 4 > > *    
		H	% 
'(v
7 v
 )v
r &
&r.   