
    ik                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( e ed       G d de                    Z) G d dejT                        Z+ G d dejT                        Z, G d de"      Z- G d d e$      Z. G d! d"ejT                        Z/ G d# d$e      Z0e G d% d&e             Z1 ed'       G d( d)e1             Z2e G d* d+e             Z3 ed,       G d- d.e1             Z4g d/Z5y)0zPyTorch Parakeet model.    N)Callable)	dataclass)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )%FastSpeech2ConformerConvolutionModule)LlamaAttentioneager_attention_forward   )ParakeetCTCConfigParakeetEncoderConfigz
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   6    e Zd ZU dZej
                  dz  ed<   y)ParakeetEncoderModelOutputNattention_mask)__name__
__module____qualname__r    torchTensor__annotations__     i/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/parakeet/modular_parakeet.pyr   r   %   s     +/NELL4'.r(   r   c                        e Zd ZU dZej
                  ed<   ddef fdZ ej                         dej
                  fd       Z
 xZS )$ParakeetEncoderRelPositionalEncodingz*Relative positional encoding for Parakeet.inv_freqconfigc                 6   t         |           |j                  | _        d}d|t        j                  d|j
                  dt        j                        j                  |t        j                        |j
                  z  z  z  }| j                  d|d	       y )
N     @      ?r   r   dtype)devicer2   r,   F)
persistent)
super__init__max_position_embeddingsr$   arangehidden_sizeint64tofloatregister_buffer)selfr-   r3   baser,   	__class__s        r)   r6   z-ParakeetEncoderRelPositionalEncoding.__init__4   s    '-'E'E$Q 2 2AU[[ILLTZbgbmbmLn$$%
 	ZeDr(   hidden_statesc                    |j                   d   }|| j                  kD  rt        d| d| j                   d      t        j                  |dz
  | d|j
                        }| j                  d d d d f   j                         j                  |j                   d   dd      j                  |j
                        }|d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd	}t        |d
      5  |j                         |j                         z  j                  dd      }|j                         }|j!                         }	t        j"                  ||	gd      }
 |
j$                  g |
j                   d d d }
d d d        
j                  |j&                        S # 1 sw Y   %xY w)Nr   zSequence Length: z= has to be less or equal than config.max_position_embeddings .r3   r   mpscpuF)device_typeenabledr   dimr1   )shaper7   
ValueErrorr$   r8   r3   r,   r<   expandr;   
isinstancetypestrr   	transposesincosstackreshaper2   )r>   rA   
seq_lengthposition_idsinv_freq_expandedposition_ids_expandedrH   freqsrT   rU   	pos_embeds              r)   forwardz,ParakeetEncoderRelPositionalEncoding.forwardB   s   "((+
444#J< 02262N2N1OqR 
 ||JNZKML`L`aMM$4-(..0778K8KA8NPRTUVYYZgZnZno 	 !-T4] ; A A C -..33S9m>R>R>W>W[`>`   %% 	
 UC 	E&,,.1F1L1L1NNYYZ[]^_E))+C))+CS#JB7I)	))D9??3B+?DDI	E ||-"5"5|66	E 	Es   5BG%%G.N)r!   r"   r#   __doc__r$   r%   r&   r   r6   no_gradr^   __classcell__r@   s   @r)   r+   r+   /   sF    4llE4 E U]]_7U\\ 7 7r(   r+   c                   *     e Zd Zdef fdZd Z xZS )ParakeetEncoderFeedForwardr-   c                 `   t         |           t        j                  |j                  |j
                  |j                        | _        t        |j                     | _
        t        j                  |j
                  |j                  |j                        | _        |j                  | _        y )Nbias)r5   r6   r   Linearr9   intermediate_sizeattention_biaslinear1r   
hidden_act
activationlinear2activation_dropoutr>   r-   r@   s     r)   r6   z#ParakeetEncoderFeedForward.__init__b   s|    yy!3!3V5M5MTZTiTij !2!23yy!9!96;M;MTZTiTij"(";";r(   c                     | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }|S )Nptraining)rn   rl   r   
functionaldropoutrp   ru   ro   )r>   rA   s     r)   r^   z"ParakeetEncoderFeedForward.forwardi   sU    ](CD--mt?V?Vaeanan-o]3r(   )r!   r"   r#   r   r6   r^   rb   rc   s   @r)   re   re   a   s    <4 <r(   re   c                   &     e Zd Zddef fdZ xZS ) ParakeetEncoderConvolutionModuler-   c                 &    t         |   ||       y r_   )r5   r6   )r>   r-   module_configr@   s      r)   r6   z)ParakeetEncoderConvolutionModule.__init__q   s    /r(   r_   )r!   r"   r#   r   r6   rb   rc   s   @r)   ry   ry   p   s    04 0 0r(   ry   c                        e Zd ZdZdedef fdZ	 ddej                  dej                  dz  dej                  dz  d	e	e
   d
eej                  ej                  f   f
dZd Z xZS )ParakeetEncoderAttentionztMulti-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860.r-   	layer_idxc                    t         |   ||       d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  t        j                  |j                  | j                              | _        t        j                  t        j                  |j                  | j                              | _        y )N)r~   Frg   )r5   r6   	is_causalr   ri   r9   num_attention_headshead_dimrelative_k_proj	Parameterr$   zerosbias_ubias_vr>   r-   r~   r@   s      r)   r6   z!ParakeetEncoderAttention.__init__x   s    95!yy););V=W=WZ^ZgZg=gnstll5;;v/I/I4==#YZll5;;v/I/I4==#YZr(   NrA   position_embeddingsr    kwargsreturnc           
         |j                   d d }|\  }}||d| j                  f}| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }t        j                  | j                  j                  t              }|	| j                  j                  d| j                  j                  d| j                        z   }|	| j                  j                  d| j                  j                  d| j                        z   }| j                  |      }|j                  |d| j                  j                  | j                        }||j!                  dddd      z  }| j#                  |      }|dd |f   }|| j$                  z  }|)|j'                  |j)                         t+        d            } || f||
||| j,                  sdn| j.                  | j$                  d	|\  }} |j0                  g |d j3                         }| j5                  |      }||fS )
NrD   r   r   r   r   .z-inf        )querykeyvaluer    rw   scaling)rM   r   q_projviewrS   k_projv_projr   get_interfacer-   _attn_implementationr   r   r   r   r   permute
_rel_shiftr   masked_fill_logical_notr<   ru   attention_dropoutrW   
contiguouso_proj)r>   rA   r   r    r   input_shape
batch_sizerX   hidden_shapequery_states
key_statesvalue_statesattention_interfacequery_states_with_bias_uquery_states_with_bias_vrelative_key_states	matrix_bdattn_outputattn_weightss                      r)   r^   z ParakeetEncoderAttention.forward   sj    $))#2.!,
J"JDMMB{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST(?(M(MKK,,.E)
 $0$++2B2Bt{{..4==3
 $
  $0$++2B2Bt{{..4==3
 $
  #223FG166z2t{{GfGfhlhuhuv -/B/J/J1aQRTU/VV	OOI.	c;J;./	,	% "..~/I/I/KUSY][I %8	%
*$#}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r(   c                     |j                   \  }}}}t        j                  j                  |d      }|j	                  ||d|      }|ddddddf   j	                  ||||      }|S )ztRelative position shift for Shaw et al. style attention. See appendix B of https://huggingface.co/papers/1901.02860.)r   r   )padrD   Nr   )rM   r   rv   r   r   )r>   attention_scoresr   	num_headsquery_lengthposition_lengths         r)   r   z#ParakeetEncoderAttention._rel_shift   st    ?O?U?U<
I|_==,,-=6,J+00YLY+Aq!"H5:::yR^`opr(   r_   )r!   r"   r#   r`   r   intr6   r$   r%   r   r   tupler^   r   rb   rc   s   @r)   r}   r}   u   s    ~[4 [ [ /3	7)||7) #\\D07) t+	7)
 +,7) 
u||U\\)	*7)r r(   r}   c                        e Zd Zdef fdZdej                  dej                  fdZ	d	dej                  dej                  fdZ
 xZS )
 ParakeetEncoderSubsamplingConv2Dr-   c                    t         |           |j                  | _        |j                  | _        |j                  | _        | j                  dz
  dz  | _        t        t        j                  |j                              | _        t        j                         | _        | j                   j#                  t        j$                  d| j                  | j                  | j
                  | j                               | j                   j#                  t        j&                                t)        | j                  dz
        D ]  }| j                   j#                  t        j$                  | j                  | j                  | j                  | j
                  | j                  | j                               | j                   j#                  t        j$                  | j                  | j                  d             | j                   j#                  t        j&                                 |j*                  | j
                  | j                  z  z  }t        j,                  |j                  |z  |j.                  d      | _        y )Nr   r   )kernel_sizestridepadding)r   r   r   groupsr   Trg   )r5   r6   subsampling_conv_kernel_sizer   subsampling_conv_strider   subsampling_conv_channelschannelsr   r   mathlog2subsampling_factor
num_layersr   
ModuleListlayersappendConv2dReLUrangenum_mel_binsri   r9   linear)r>   r-   i
out_lengthr@   s       r)   r6   z)ParakeetEncoderSubsamplingConv2D.__init__   s   !>>4488((1,2dii(A(ABC mmoIIaD4D4DT[[bfbnbno	
 	2779%t*+ 	*AKK		MMMM $ 0 0;; LL==	 KKryySTUVKKrwwy)	*" ((T[[$//-IJ
ii @ @: MvOaOahlmr(   input_lengths
conv_layerc                     t        |d      rR|j                  dk7  rC|j                  }|j                  d   }|j                  d   }||d   z   |d   z   |z
  |z  dz   }|S |S )Nr   )r   r   r   r   )hasattrr   r   r   )r>   r   r   r   r   r   output_lengthss          r)   _get_output_lengthz3ParakeetEncoderSubsamplingConv2D._get_output_length   sx    :x(Z->->&-H ((G$003K&&q)F+gaj871:ESX^^abbN!!r(   input_featuresr    c                    |j                  d      }||j                  d      nd }| j                  D ]  } ||      }t        |t        j
                        s&|)| j                  ||      }|j                  d   }t        j                  ||j                        |d d d f   k  }||d d d d d d f   z  } |j                  dd      j                  |j                  d   |j                  d   d      }| j                  |      }|S )Nr   rD   r   rE   r   )	unsqueezesumr   rP   r   r   r   rM   r$   r8   r3   rS   rW   r   )r>   r   r    rA   current_lengthslayercurrent_seq_lengthchannel_masks           r)   r^   z(ParakeetEncoderSubsamplingConv2D.forward   s   &0034B4N.,,R0TX[[ 
	@E!-0M %+0J"&"9"9/5"Q%2%8%8%;"LL!3N<Q<QRUdefhlelUmm  aq$.>!??
	@ &//15==m>Q>QRS>TVcViVijkVlnpqM2r(   r_   )r!   r"   r#   r   r6   r$   r%   r   r   r   r^   rb   rc   s   @r)   r   r      sI    !n4 !nF	 	")) 	ell ELL r(   r   c                        e Zd Zddededz  f fdZ	 	 ddej                  dej                  dz  dej                  dz  dee	   d	ej                  f
d
Z
 xZS )ParakeetEncoderBlockNr-   r~   c                    t         |           d| _        t        |      | _        t        ||      | _        t        |      | _        t        |      | _	        t        j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        y NF)r5   r6   gradient_checkpointingre   feed_forward1r}   	self_attnry   convfeed_forward2r   	LayerNormr9   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      r)   r6   zParakeetEncoderBlock.__init__
  s    &+#7?1&)D4V<	7?"$,,v/A/A"B\\&*<*<=f&8&89"$,,v/A/A"BV%7%78r(   rA   r    r   r   r   c                 x   |}| j                  | j                  |            }|d|z  z   }| j                  |      } | j                  d|||d|\  }}||z   }| j	                  | j                  |      |      }	||	z   }| j                  | j                  |            }
|d|
z  z   }| j                  |      }|S )Ng      ?)rA   r    r   )r    r'   )	r   r   r   r   r   r   r   r   r   )r>   rA   r    r   r   residualnormalized_hidden_statesr   _conv_output
ff2_outputs              r)   r^   zParakeetEncoderBlock.forward  s     !**4+B+B=+QR 3#66#'#5#5m#D ' 
2) 3
 	
Q &3ii} =ni]%3''(?(?(NO
%j(88m4r(   r_   NN)r!   r"   r#   r   r   r6   r$   r%   r   r   r^   rb   rc   s   @r)   r   r   	  sx    94 9t 9$ /337	|| t+ #\\D0	
 +, 
r(   r   c                        e Zd ZU eed<   dZdZdZdZdgZ	dZ
dZdZdZdZdZeedZ ej(                          fd	       Zd
ej,                  fdZddej,                  dedz  fdZ xZS )ParakeetPreTrainedModelr-   modelr   audioTr   F)rA   
attentionsc                    t         |   |       t        | j                  d      r| j                  j                  }n%t        | j                  j                         dd      }t        |t              rEt        j                  |j                  d|       t        j                  |j                  d|       y t        |t              ryddt        j                  d| j                  j                   dt        j"                  	      | j                  j                   z  z  z  }t        j$                  |j&                  |       y y )
Ninitializer_rangeg{Gz?r   )meanstdr0   r/   r   r   r1   )r5   _init_weightsr   r-   r   getattrget_text_configrP   r}   initnormal_r   r   r+   r$   r8   r9   r:   copy_r,   )r>   moduler   r,   r@   s       r)   r   z%ParakeetPreTrainedModel._init_weightsN  s    f%4;; 34++//C $++5579LdSCf67LLSc:LLSc: DEELLDKK,C,CQekkZ]a]h]h]t]ttuH JJv1	 Fr(   r   c                    t        | j                  t              r| j                  j                  n| j                  }|j                  }|j
                  }t        t        j                  |j                              }|dz
  dz  dz  }||z
  }|}t        |      D ]Q  }	t        j                  |j                  t        j                        |z   |      dz   }t        j                  |      }S |j                  t        j                        S )Nr   r   r1   r0   )rP   r-   r   encoder_configr   r   r   r   r   r   r   r$   divr;   r<   floor)
r>   r   r  r   r   r   all_paddingsadd_padlengthsr   s
             r)   _get_subsampling_output_lengthz6ParakeetPreTrainedModel._get_subsampling_output_lengthb  s    7A$++O`7a33gkgrgr$AA77>#D#DEF
#aA-1,z" 	+Aii


 = GPSVVGkk'*G	+ zz		z**r(   Nr    target_lengthc                     | j                  |j                  d            }||n|j                         }t        j                  ||j
                        |dddf   k  }|S )z
        Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
        when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
        rD   NrE   )r	  r   maxr$   r8   r3   )r>   r    r
  r   
max_lengths        r)   _get_output_attention_maskz2ParakeetPreTrainedModel._get_output_attention_masks  sc    
 <<^=O=OPR=ST&3&?]^EWEWEY
j9N9NOR`abdhahRiir(   r_   )r!   r"   r#   r   r&   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr   r}   _can_record_outputsr$   ra   r   r%   r	  r   r  rb   rc   s   @r)   r   r   8  s    &O&*#/0$(!N !!"&-.
 U]]_2 2&+ELL +"	 	VY\`V` 	r(   r   z{
    The Parakeet Encoder model, based on the [Fast Conformer architecture](https://huggingface.co/papers/2305.05084).
    c                        e Zd ZU eed<   dZdef fdZeee	e
	 	 ddej                  dej                  dz  dedz  dee   d	ef
d
                            Z xZS )ParakeetEncoderr-   encoderc           	         t         |   |       || _        d| _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd| _        t        |      | _        t        |      | _        t!        j"                  t%        |j&                        D cg c]  }t)        ||       c}      | _        | j-                          y c c}w )NFr0   )r5   r6   r-   r   rw   dropout_positions	layerdropscale_inputr   sqrtr9   input_scaler   subsamplingr+   encode_positionsr   r   r   num_hidden_layersr   r   	post_initr   s      r)   r6   zParakeetEncoder.__init__  s     &+#~~!'!9!9))<B<N<N499V%7%78TW;FC DV LmmFKFLdLdFef!&)4f
 	 gs   
C:Nr   r    output_attention_maskr   r   c                    | j                  ||      }|| j                  z  }| j                  |      }t        j                  j                  || j
                  | j                        }t        j                  j                  || j                  | j                        }|u| j                  ||j                  d         }|j                  d      j                  d|j                  d   d      }||j                  dd      z  }|j                  d      }| j                  D ]E  }d}	| j                  r&t        j                  g       }
|
| j                   k  rd}	|	r: ||f||d	|}G t#        ||rj%                         
      S d
      S )aJ  
        output_attention_mask (`bool`, *optional*):
            Whether to return the output attention mask.

        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        rs   Nr   r
  rD   r   FT)r    r   )last_hidden_stater    )r$  r#  r%  r   rv   rw   ru   r  r  rM   r   rO   rS   r   r$   randr   r   r   )r>   r   r    r(  r   rA   r   output_maskencoder_layerto_dropdropout_probabilitys              r)   r^   zParakeetEncoder.forward  s   F ((H%(8(88"33MB--mt||VZVcVc-d mm334#9#9DMM 4 
 %99.XeXkXklmXn9oK(2215<<RATATUVAWY[\N+n.F.Fq!.LLN+55a8N![[ 	MG}}&+jjn#&7"G -!!#1(;! 	!	  *+QfKOO<M
 	
lp
 	
r(   r   )r!   r"   r#   r   r&   r  r6   r   r   r   r   r$   r%   boolr   r   r
   r^   rb   rc   s   @r)   r  r    s     "!!4 &  /3-1	@
@
 t+@
  $d{	@

 +,@
 
@
     @
r(   r  c                       e Zd ZU dZej
                  ed<   dZeej                     dz  ed<   dZ
eeej                        dz  ed<   dZeeej                        dz  ed<   y)ParakeetGenerateOutputal  
    Outputs of Parakeet models.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    	sequencesNlogitsr   rA   )r!   r"   r#   r`   r$   
LongTensorr&   r5  r   FloatTensorr   rA   r'   r(   r)   r3  r3    sm    & .2FE%##$t+29=JeE--./$6=<@M5u0012T9@r(   r3  zS
    Parakeet Encoder with a Connectionist Temporal Classification (CTC) head.
    c                   L    e Zd ZU eed<   def fdZee	 	 ddej                  dej                  dz  dej                  dz  de
e   def
d	              Z ej                         	 	 ddej                  dej                  dz  d
ede
e   deej"                  z  f
d       Z xZS )ParakeetForCTCr-   c                     t         |   |       t        |j                        | _        t        j                  |j                  j                  |j                  d      | _	        | j                          y )Nr   r   )r5   r6   r  r  r  r   Conv1dr9   
vocab_sizectc_headr'  rq   s     r)   r6   zParakeetForCTC.__init__  sS     &v'<'<=		&"7"7"C"CVEVEVdefr(   Nr   r    labelsr   r   c           
          | j                   d||d|}|j                  }| j                  |j                  dd            j                  dd      }d}|Y||n$t	        j
                  |t        j                        }| j                  |j                  d            }	|| j                  j                  k7  }
|
j                  d      }|j                  |
      }t        j                  j                  |dt        j                        j                  dd      }t        j                   j"                  j%                  d	
      5  t        j                  j'                  |||	|| j                  j                  | j                  j(                  | j                  j*                        }ddd       t-        |||j.                  |j0                        S # 1 sw Y   ,xY w)a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = ParakeetForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> outputs = model(**inputs)

        >>> print(outputs.loss)
        ```r   r    r   r   Nr1   rD   )rK   r2   r   F)rI   )blank	reductionzero_infinity)lossr5  rA   r   r'   )r  r+  r=  rS   r$   	ones_likelongr	  r   r-   pad_token_idmasked_selectr   rv   log_softmaxfloat32backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr   rA   r   )r>   r   r    r>  r   encoder_outputsrA   r5  rD  r   labels_masktarget_lengthsflattened_targets	log_probss                 r)   r^   zParakeetForCTC.forward  s   : '$,, 
))
 
 (99}66q!<=GG1M #1"<%//R`hmhrhrBs  !??@R@RSU@VWM !DKK$<$<<K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 )77&11	
 	
	 	s   A#GGreturn_dict_in_generatec                 H   d|d<    | j                   d	||d|}|j                  j                  d      }|:| j                  ||j                  d         }| j
                  j                  || <   |r-t        ||j                  |j                  |j                        S |S )
a3  
        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = ParakeetForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        Treturn_dictr@  rD   rJ   r   r*  )r4  r5  r   rA   r'   )
r^   r5  argmaxr  rM   r-   rG  r3  r   rA   )r>   r   r    rV  r   outputsr4  s          r)   generatezParakeetForCTC.generateV  s    : !%}".$,, #
))#
 #
 NN))b)1	 %!<<^[d[j[jkl[m<nN)-)A)AI~o&")#~~"--%33	  r(   r   r   )r!   r"   r#   r   r&   r6   r   r   r$   r%   r   r   r   r^   ra   r1  r3  r6  r[  rb   rc   s   @r)   r9  r9    s    0   /3&*	E
E
 t+E
 t#	E

 +,E
 
E
  E
N U]]_ /3(-	33 t+3 "&	3
 +,3 
 %"2"2	23 3r(   r9  )r9  r  r   )6r`   r   collections.abcr   dataclassesr   r$   r    r   r   activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   4fastspeech2_conformer.modeling_fastspeech2_conformerr   llama.modeling_llamar   r   configuration_parakeetr   r   r   Moduler+   re   ry   r}   r   r   r   r  r3  r9  __all__r'   r(   r)   <module>rl     sd     $ !   & ! 9 ? F & V V G 5 h J L 
/ / //7299 /7d 0'L 0
L ~ L ^Bryy BJ,5 ,^ Co C CL 
[
- [

[
| A[ A A4 
H, H
HV Kr(   