
    i\                     F   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  G d de,e      Z- G d de*      Z. G d de#      Z/ G d de"      Z0 G d dejb                        Z2 G d d e      Z3 G d! d"e      Z4 G d# d$e&      Z5 G d% d&e%      Z6 G d' d(e(      Z7 ed)*       G d+ d,e7             Z8 G d- d.e'      Z9g d/Z:y)0    N)Callable)	Tokenizer)Unigram)nn   )create_bidirectional_mask)BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TokenizersBackend)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )LlamaAttentionLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)ParakeetCTCConfigParakeetEncoderConfig)ParakeetEncoderBlock ParakeetEncoderConvolutionModuleParakeetForCTCParakeetPreTrainedModel)ParakeetProcessor)T5Tokenizerc                   d     e Zd Z	 	 	 	 	 	 	 d	 fd	Z	 	 	 d
deee   z  dededz  dedef
dZ xZ	S )LasrTokenizerNc                     t        	|   d|||||||d| t        t        | j                  dd            | _        y )N)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensvocab
vocab_filer   F)unk_idbyte_fallback )super__init__r   r   _vocab_scores
_tokenizer)
selfr#   r$   r%   r&   r'   r(   r)   kwargs	__class__s
            a/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/lasr/modular_lasr.pyr.   zLasrTokenizer.__init__,   s[     	 		
&?!		
 		
 $""#
    	token_idsskip_special_tokensclean_up_tokenization_spacesgroup_tokensreturnc                     t        |t              r|g}|r%t        j                  |      D cg c]  }|d   	 }}|D cg c]  }|| j                  k7  s| }}t        j                  | f|||d|S c c}w c c}w )Nr   )r6   r7   r8   )
isinstanceint	itertoolsgroupbypad_token_idr   _decode)r1   r6   r7   r8   r9   r2   token_grouptokens           r4   rA   zLasrTokenizer._decodeI   s     i%"I;D;L;LY;WXKQXIX )2PuUd>O>O5OUP	P ((
 3)E	

 
 	
 Y Qs   A4A9A9)z</s>z<unk>z<pad>d   NNN)FNT)
__name__
__module____qualname__r.   r=   listboolstrrA   __classcell__r3   s   @r4   r!   r!   +   sl     "&
@ %*48!
c?
 "
 '+Tk	

 
 

r5   r!   c                       e Zd Zy)LasrProcessorNrE   rF   rG   r,   r5   r4   rN   rN   b   s    r5   rN   c                   \     e Zd ZdZddddddddd	d
ddddddddddddgddgddf fd	Z xZS )LasrEncoderConfiga  
    This is the configuration class to store the configuration of a [`LasrEncoder`]. It is used to instantiate a
    `LasrEncoder` model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
            hidden_size (`int`, *optional*, defaults to 512):
                Dimension of the layers and the hidden states.
            num_hidden_layers (`int`, *optional*, defaults to 17):
                Number of hidden layers in the Transformer encoder.
            num_attention_heads (`int`, *optional*, defaults to 8):
                Number of attention heads for each attention layer in the Transformer encoder.
            intermediate_size (`int`, *optional*, defaults to 2048):
                Dimension of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
            hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
                The non-linear activation function (function or string) in the encoder and pooler.
            attention_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in the attention layers.
            convolution_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in convolutions of the conformer's convolution module.
            conv_kernel_size (`int`, *optional*, defaults to 32):
                The kernel size of the convolution layers in the Conformer block.
            subsampling_conv_channels (`int`, *optional*, defaults to 256):
                The number of channels in the subsampling convolution layers.
            subsampling_conv_kernel_size (`int`, *optional*, defaults to 5):
                The kernel size of the subsampling convolution layers.
            subsampling_conv_stride (`int`, *optional*, defaults to 2):
                The stride of the subsampling convolution layers.
            num_mel_bins (`int`, *optional*, defaults to 128):
                Number of mel features.
            dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler.
            dropout_positions (`float`, *optional*, defaults to 0.0):
                The dropout ratio for the positions in the input sequence.
            layerdrop (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the layers in the encoder.
            activation_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for activations inside the fully connected layer.
            attention_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the attention layers.
            max_position_embeddings (`int`, *optional*, defaults to 10000):
                The maximum sequence length that this model might ever be used with.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            layer_norm_eps (`float`, *optional*, defaults to 1e-06):
                The epsilon used by the layer normalization layers.
            feed_forward_residual_weights (`tuple[float, float]`, *optional*, defaults to `[1.5, 0.5]`):
                The residual weights for the feed forward layers.
            conv_residual_weights (`tuple[float, float]`, *optional*, defaults to `[2.0, 1.0]`):
                The residual weights for the convolution layers.
            batch_norm_momentum (`float`, *optional*, defaults to 0.01):
                The momentum for the batch normalization layers.
            rope_parameters (`RopeParameters`, *optional*):
                Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
                a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
                with longer `max_position_embeddings`.

    Example:
        ```python
        >>> from transformers import LasrEncoderModel, LasrEncoderConfig

        >>> # Initializing a `LasrEncoder` configuration
        >>> configuration = LasrEncoderConfig()

        >>> # Initializing a model from the configuration
        >>> model = LasrEncoderModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```

    This configuration class is based on the LasrEncoder architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
             i   siluF          r      g?        i'  g{Gz?gư>g      ?g      ?g       @g      ?g{Gz?Nc                     || _         || _        || _        || _        || _        t        |   di d|d|d|d|d|d|d|d|d	|	d
|d|
d|d|d|d|d|d|d|d|| | `| `y )Nhidden_sizenum_hidden_layersnum_attention_headsintermediate_size
hidden_actattention_biasconvolution_biasconv_kernel_sizesubsampling_conv_channelsnum_mel_binssubsampling_conv_kernel_sizesubsampling_conv_stridedropoutdropout_positions	layerdropactivation_dropoutattention_dropoutmax_position_embeddingsinitializer_ranger,   )	rope_parameterslayer_norm_epsfeed_forward_residual_weightsconv_residual_weightsbatch_norm_momentumr-   r.   subsampling_factorscale_input)r1   r\   r]   r^   r_   r`   ra   rb   rc   rd   rf   rg   re   rh   ri   rj   rk   rl   rm   rn   rp   rq   rr   rs   ro   r2   r3   s                             r4   r.   zLasrEncoderConfig.__init__   s   8  /,-J*%:"#6  	
#	
/	
 !4	
 0		

 "	
 *	
 .	
 .	
 '@	
 &	
 *F	
 %<	
 	
 0	
  	
   2!	
" 0#	
$ %<%	
& 0)	
. #r5   )rE   rF   rG   __doc__r.   rK   rL   s   @r4   rQ   rQ   f   sf    K^ "%%& ! %'*Cj"Cj 3: :r5   rQ   c                   J     e Zd ZdZ	 	 	 	 	 ddeez  f fdZed        Z xZ	S )LasrCTCConfiga  
    This is the configuration class to store the configuration of a [`LasrForCTC`]. It is used to instantiate a
    Lasr CTC model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.
    Args:
            vocab_size (`int`, *optional*, defaults to 512):
                Vocabulary size of the model.
            ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
                Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
                instance of [`LasrForCTC`].
            ctc_zero_infinity (`bool`, *optional*, defaults to `True`):
                Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
                occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
                of [`LasrForCTC`].
            encoder_config (`Union[dict, LasrEncoderConfig]`, *optional*):
                The config object or dictionary of the encoder.
            pad_token_id (`int`, *optional*, defaults to 0):
                Padding token id. Also used as blank token id.
    Example:
        ```python
        >>> from transformers import LasrForCTC, LasrCTCConfig
        >>> # Initializing a Lasr configuration
        >>> configuration = LasrCTCConfig()
        >>> # Initializing a model from the configuration
        >>> model = LasrForCTC(configuration)
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    This configuration class is based on the Lasr CTC architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
    encoder_configc           	      0    t        |   d|||||d| y )N)
vocab_sizectc_loss_reductionctc_zero_infinityry   r@   r,   )r-   r.   )r1   r{   r|   r}   ry   r@   r2   r3   s          r4   r.   zLasrCTCConfig.__init__  s0     	 	
!1/)%	
 	
r5   c                 4    | j                   j                  dz  S )Nr   )ry   rg   )r1   s    r4   inputs_to_logits_ratioz$LasrCTCConfig.inputs_to_logits_ratio%  s    ""::A==r5   )rR   meanTNr   )
rE   rF   rG   rv   dictrQ   r.   propertyr   rK   rL   s   @r4   rx   rx      sC    F !37

 00
$ > >r5   rx   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )LasrEncoderSubsamplingconfigc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j
                  |j                  |j                        | _
        t        j                  |j
                  |j                  |j                  |j                        | _        t        j                  |j                  |j
                        | _        t        j                         | _        y )N)kernel_sizestride)r-   r.   r   Linearre   r\   dense_0Conv1drf   rg   conv_0rd   conv_1dense_1ReLUact_fn)r1   r   r3   s     r4   r.   zLasrEncoderSubsampling.__init__+  s    yy!4!4f6H6HIii;;11	
 ii,,;;11	
 yy!A!A6CUCUVggir5   input_featuresr:   c                 ,   | j                  | j                  |            }|j                  dd      }| j                  | j                  |            }| j                  | j	                  |            }|j                  dd      }| j                  |      S )N   r   )r   r   	transposer   r   r   )r1   r   hidden_statess      r4   forwardzLasrEncoderSubsampling.forward=  sz    DLL$@A%//15DKK$>?DKK$>?%//15||M**r5   )	rE   rF   rG   rQ   r.   torchTensorr   rK   rL   s   @r4   r   r   *  s+     0  $+ell +u|| +r5   r   c                       e Zd Zy)LasrEncoderRotaryEmbeddingNrO   r,   r5   r4   r   r   F  s    r5   r   c                        e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   dz  dej                  dz  de	e
   d	eej                  ej                  f   f
d
Z xZS )LasrEncoderAttentionr   	layer_idxc                 4    t         |   ||       d| _        y )NF)r-   r.   	is_causalr1   r   r   r3   s      r4   r.   zLasrEncoderAttention.__init__J  s    +r5   Nr   position_embeddingsattention_maskr2   r:   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	|\  }
}t        |||
|      \  }}t        j                  | j                  j                  t              } || |||	|f| j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nr   r   rZ   )rh   scaling)shapehead_dimq_projviewr   k_projv_projr   r
   get_interfacer   _attn_implementationr   trainingrl   r   reshape
contiguouso_proj)r1   r   r   r   r2   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  r4   r   zLasrEncoderAttention.forwardN  sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r5   NN)rE   rF   rG   rQ   r=   r.   r   r   tupler   r   r   rK   rL   s   @r4   r   r   I  s    0 S  IM.2	")||") #5<<#=>E") t+	")
 +,") 
u||U\\)	*")r5   r   c                   &     e Zd Zddef fdZ xZS )LasrEncoderConvolutionModuler   c                     t         |   ||       d| _        t        j                  |j
                  |j                        | _        y )Nsame)momentum)r-   r.   paddingr   BatchNorm1dr\   rs   norm)r1   r   module_configr3   s      r4   r.   z%LasrEncoderConvolutionModule.__init__t  s7    /NN6#5#5@Z@Z[	r5   N)rE   rF   rG   rQ   r.   rK   rL   s   @r4   r   r   s  s    \0 \ \r5   r   c                        e Zd Zdedef fdZ	 	 ddej                  dej                  dz  dej                  dz  dee	   d	ej                  f
d
Z
 xZS )LasrEncoderBlockr   r   c                 T   t         |   ||       |j                  | _        |j                  | _        t	        j
                  |j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j
                  |j                  |j                  d      | _
        t	        j
                  |j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _        y )NF)bias)r-   r.   rq   rr   r   	LayerNormr\   rp   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      r4   r.   zLasrEncoderBlock.__init__{  s    +-3-Q-Q*%+%A%A""$,,v/A/A6CXCX_d"e\\&*<*<f>S>SZ_`f&8&8&:O:OV[\"$,,v/A/A6CXCX_d"eV%7%79N9NUZ[r5   Nr   r   r   r2   r:   c                 0   |}| j                  | j                  |            }| j                  d   |z  | j                  d   |z  z   }| j                  |      } | j                  d|||d|\  }}||z   }| j                  | j                  |      |      }	| j                  d   |z  | j                  d   |	z  z   }|}| j                  | j                  |            }| j                  d   |z  | j                  d   |z  z   }| j                  |      }|S )Nr   r   )r   r   r   )r   r,   )feed_forward1r   rq   r   	self_attnconvr   rr   feed_forward2r   r   )
r1   r   r   r   r2   residualnormalized_hidden_statesr   _conv_outputs
             r4   r   zLasrEncoderBlock.forward  sJ    !**4+B+B=+QR..q1H<t?a?abc?dgt?tt 	 $(#5#5m#D ' 
2) 3
 	
Q &3ii} =ni]2215EHbHbcdHehsHss **4+B+B=+QR..q1H<t?a?abc?dgt?tt 	 m4r5   r   )rE   rF   rG   rQ   r=   r.   r   r   r   r   r   rK   rL   s   @r4   r   r   z  sw    
\0 
\S 
\ /337	!||! t+! #\\D0	!
 +,! 
!r5   r   c                   6    e Zd ZdZd Zdej                  fdZy)LasrPreTrainedModelFc                 .    t        j                  |       y r   )r   _init_weights)r1   modules     r4   r   z!LasrPreTrainedModel._init_weights  s    %%f-r5   input_lengthsc                     t        | j                  t              r| j                  j                  n| j                  }|j                  }|j
                  }d}t        |      D ]  }||z
  |z  dz   } |S )Nr   r   )r<   r   rx   ry   rf   rg   range)r1   r   ry   r   r   
num_layersr   s          r4   _get_subsampling_output_lengthz2LasrPreTrainedModel._get_subsampling_output_length  st    7A$++}7]33cgcncn$AA77
z" 	HA*[8VCaGM	H r5   N)rE   rF   rG   _supports_flex_attnr   r   r   r   r,   r5   r4   r   r     s    .	ELL 	r5   r   zh
    The LasrEncoder model, based on the Conformer architecture](https://arxiv.org/abs/2005.08100).
    )custom_introc                        e Zd ZU eed<   dZdef fdZeee	e
	 d
dej                  dej                  dz  dee   defd	                            Z xZS )LasrEncoderr   encoderc           	         t         |   |       d| _        |j                  | _        |j                  | _        |j
                  | _        t        |      | _        t        |      | _	        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                   |j"                  |j$                  d      | _        | j)                          y c c}w )NF)epsr   )r-   r.   gradient_checkpointingrh   ri   rj   r   
subsamplerr   
rotary_embr   
ModuleListr   r]   r   layersr   r\   rp   out_norm	post_initr   s      r4   r.   zLasrEncoder.__init__  s     &+#~~!'!9!9))084V<mmBGH`H`BabYfi0b
 V%7%7V=R=RY^_	 cs   C2Nr   r   r2   r:   c                 b   | j                  |      }| j                  |t        j                  |j                  d   |j
                        j                  d            \  }}t        j                  j                  || j                  | j                        }t        j                  j                  || j                  | j                        }t        j                  j                  || j                  | j                        }| | j                  ||j                  d         }t        | j                  ||      }| j                  D ]G  }d}| j                  r&t        j                   g       }	|	| j"                  k  rd}|r: ||f|||fd	|}I | j%                  |      }t'        |
      S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        r   )devicer   )pr   )target_length)r   inputs_embedsr   FT)r   r   )last_hidden_state)r   r   r   aranger   r   	unsqueezer   
functionalrh   r   ri   _get_output_attention_maskr   r   r   randrj   r   r	   )
r1   r   r   r2   r   r   r   encoder_layerto_dropdropout_probabilitys
             r4   r   zLasrEncoder.forward  s   > 7??5<<(;(;A(>}G[G[\ffghi
S --mt||VZVcVc-dmm##C4+A+ADMM#Zmm##C4+A+ADMM#Z%!<<^[h[n[nop[q<rN2;;')
 "[[ 	MG}}&+jjn#&7"G -!!#1),c
! 	!	  m4??r5   r   )rE   rF   rG   rQ   __annotations__base_model_prefixr.   r   r   r   r   r   r   r   r   r	   r   rK   rL   s   @r4   r   r     s     !0 "  /3?@?@ t+?@ +,	?@
 
?@     ?@r5   r   c                        e Zd Z fdZ xZS )
LasrForCTCc                  8     t               j                  di | S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = LasrForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        r,   )r-   generate)super_kwargsr3   s    r4   r  zLasrForCTC.generate  s    ,  uw/,//r5   )rE   rF   rG   r  rK   rL   s   @r4   r  r    s    0 0r5   r  )r  r   r   rN   rQ   rx   r!   );r>   collections.abcr   r   
tokenizersr   tokenizers.modelsr   r   masking_utilsr   modeling_outputsr	   modeling_utilsr
   r   processing_utilsr   tokenization_utils_tokenizersr   utilsr   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   r   parakeet.configuration_parakeetr   r   parakeet.modeling_parakeetr   r   r   r   parakeet.processing_parakeetr   t5.tokenization_t5r   r!   rN   rQ   rx   Moduler   r   r   r   r   r   r   r  __all__r,   r5   r4   <module>r     s    $    %  6 / F & > I I 7 5 v v V  = ,4
K!2 4
n	% 	H- HV6>% 6>r+RYY +8 <!5 ;')> ')T\#C \.+ .b1 & 
X@% X@
X@v0 04r5   