
    2ic                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddl	mc m
Z ddlmZ ddlmZ ddlmZ dd	lmZmZ d
dlmZ ddlmZ e G d de             Ze G d de             Ze G d de             Z G d dej:                        Z G d dej:                        Z G d dej:                        Z  G d dej:                        Z! G d dej:                        Z" G d dej:                        Z# G d  d!ej:                        Z$ G d" d#ej:                        Z%e G d$ d%e             Z& ed&'       G d( d)e&             Z'd)d%gZ(y)*zTransformers Xcodec model.    N)	dataclass)	lru_cache   )initialization)conv1d_output_length)PreTrainedAudioTokenizerBase)ModelOutputauto_docstring   )	AutoModel   )XcodecConfigc                   b    e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   y)XcodecOutputao  
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
        audio_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`, *optional*)
            Decoded audio values obtained using the decoder part of Xcodec.
    Naudio_codesaudio_values)
__name__
__module____qualname____doc__r   torch
LongTensor__annotations__r   FloatTensor     f/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/xcodec/modeling_xcodec.pyr   r       s3     ,0K!!D(/-1L%##d*1r   r   c                   :    e Zd ZU dZdZej                  dz  ed<   y)XcodecEncoderOutputz
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
    Nr   )r   r   r   r   r   r   r   r   r   r   r   r   r   .   s     ,0K!!D(/r   r   c                   :    e Zd ZU dZdZej                  dz  ed<   y)XcodecDecoderOutputz
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, channels, num_samples)`, *optional*):
            Decoded audio values obtained using the decoder part of Xcodec.
    Nr   )r   r   r   r   r   r   r   r   r   r   r   r!   r!   9   s     .2L%##d*1r   r!   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z	 xZ
S )
ResidualUnitzFResidual block for SemanticEncoder and SemanticDecoder used in Xcodec.configin_channelsout_channelsdilationc           
         t         |           t        j                         | _        |j
                  dz
  dz  |z  }t        j                  |||j
                  d||dd      | _        t        j                  ||dd      | _        y )Nr   r   F)stridepaddingr'   groupsbias)r%   r&   kernel_sizer,   )	super__init__nnELU
activationunit_kernel_sizeConv1dconv1conv2)selfr$   r%   r&   r'   r*   	__class__s         r   r/   zResidualUnit.__init__G   s}    &&(++a/A5AYY##	

 YY<l`ahmn
r   hidden_statereturnc                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }||z   S N)r2   r5   r6   )r7   r9   output_tensors      r   forwardzResidualUnit.forwardW   sE    5

=16

=1m++r   )r   r   r   r   r   intr/   r   Tensorr>   __classcell__r8   s   @r   r#   r#   D   sH    Po| o# oS o\_ o ,ELL ,U\\ ,r   r#   c                   h     e Zd Zdedededef fdZdej                  dej                  fdZ xZ	S )	SemanticEncoderBlockr$   r%   r&   r)   c                    t         |           t        j                  |j                  D cg c]  }t        ||||       c}      | _        |dk(  rdnd|z  }|dz
  dz  }t        j                  |||||d      | _        y c c}w )Nr   r   r   Tr-   r)   r*   r,   )	r.   r/   r0   
ModuleListblock_dilationsr#   	res_unitsr4   conv)	r7   r$   r%   r&   r)   r'   kernelr*   r8   s	           r   r/   zSemanticEncoderBlock.__init__`   s    V\VlVlm(\&+{HEm

 kF
A:!#IIk<VTZdkrvw	 ns   Br9   r:   c                 Z    | j                   D ]
  } ||      } | j                  |      }|S r<   )rI   rJ   r7   r9   units      r   r>   zSemanticEncoderBlock.forwardk   s3    NN 	.D-L	.yy.r   
r   r   r   r   r?   r/   r   r@   r>   rA   rB   s   @r   rD   rD   _   sE    	x| 	x# 	xS 	xZ] 	xELL U\\ r   rD   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )SemanticEncoderc                     t         |           t        |j                        t        |j                        k7  rt        d      t        j                  |j                  |j                  |j                  d|j                  dz  d      | _
        |j                  }g }t        |j                        D ]>  \  }}t        |j                  |j                  |   z        }|t        ||||      gz  }|}@ t        j                  |      | _        y )Nz:Number of strides must match the number of channel_ratios.r   r   Fr,   )r.   r/   lenstrideschannel_ratios
ValueErrorr0   r4   semantic_hidden_sizer-   rJ   	enumerater?   rD   rG   conv_blocks)r7   r$   r%   rZ   ir)   r&   r8   s          r   r/   zSemanticEncoder.__init__s   s    v~~#f&;&;"<<YZZII''''!#
	 11"6>>2 	'IAvv::V=R=RST=UUVL0lTZ[\\K&K	'
 ==5r   r9   r:   c                 Z    | j                  |      }| j                  D ]
  } ||      } |S r<   )rJ   rZ   r7   r9   blocks      r   r>   zSemanticEncoder.forward   s5    yy.%% 	/E .L	/r   r   r   r   r/   r   r@   r>   rA   rB   s   @r   rQ   rQ   r   s#    6,ELL U\\ r   rQ   c                   h     e Zd Zdedededef fdZdej                  dej                  fdZ xZ	S )	SemanticDecoderBlockr$   r%   r&   r)   c                 h   t         	|           |dk(  r!t        j                  ||dddd      | _        n:d|z  }|dz   dz  }|dz  dk(  rdnd}t        j
                  ||||||d      | _        t        j                  |j                  D cg c]  }t        ||||       c}      | _	        y c c}w )	Nr   r   TrF   r   r   FrS   )
r.   r/   r0   r4   rJ   ConvTranspose1drG   rH   r#   rI   )
r7   r$   r%   r&   r)   r-   r*   output_paddingr'   r8   s
            r   r/   zSemanticDecoderBlock.__init__   s    Q;		DI f*Kza'G"(1*/QqN**\;^cDI X^XnXnoH\&,hGo
os   B/r9   r:   c                 Z    | j                  |      }| j                  D ]
  } ||      } |S r<   )rJ   rI   rM   s      r   r>   zSemanticDecoderBlock.forward   s3    yy.NN 	.D-L	.r   rO   rB   s   @r   ra   ra      s@    
| 
# 
S 
Z] 
.ELL U\\ r   ra   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )SemanticDecoderc                     t         |           t        j                  |j                  t        |j                  |j                  d   z        |j                  d|j                  dz  d      | _        g }t        |j                        D ]  \  }}t        |j                  |j                  |   z        }|t        |j                        dz
  k  r)t        |j                  |j                  |dz      z        }n|j                  }|t        ||||      gz  } t        j                  |      | _        t        j                  |j                  |j                  |j                  d|j                  dz  d      | _        y )Nr   r   r   F)r%   r&   r-   r)   r*   r,   )r)   r*   r,   )r.   r/   r0   r4   rX   r?   rV   r-   r5   rY   rU   rT   ra   rG   rZ   r6   )r7   r$   rZ   r[   r)   r%   r&   r8   s          r   r/   zSemanticDecoder.__init__   s_   YY33V886;P;PQR;SST**&&!+

 "6>>2 	]IAvf99F<Q<QRS<TTUKC--.23"6#>#>AVAVWX[\W\A]#]^%::0lTZ[\\K	] ==5YY''''&&!+

r   r9   r:   c                 |    | j                  |      }| j                  D ]
  } ||      } | j                  |      }|S r<   )r5   rZ   r6   r]   s      r   r>   zSemanticDecoder.forward   sC    zz,/%% 	/E .L	/zz,/r   r_   rB   s   @r   rg   rg      s#    
>ELL U\\ r   rg   c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )XcodecEuclideanCodebookz!Codebook with Euclidean distance.c                    t         |           t        j                  |j                  |j
                        }|j                  | _        | j                  dt        j                  dg             | j                  dt        j                  |j                               | j                  d|       | j                  d|j                                y )NinitedTcluster_sizeembed	embed_avg)	r.   r/   r   zeroscodebook_sizecodebook_dimregister_bufferr@   clone)r7   r$   ro   r8   s      r   r/   z XcodecEuclideanCodebook.__init__   s    F00&2E2EF#11Xu||TF';<^U[[9M9M-NOWe,[%++-8r   c                    | j                   j                         }|j                  d      j                  dd      }|d|z  |z  z
  |j                  d      j                  dd      z    }|j	                  d      j
                  }|S )Nr   r   T)keepdimr   dim)ro   tpowsummaxindices)r7   hidden_statesro   scaled_statesdist	embed_inds         r   quantizez XcodecEuclideanCodebook.quantize   s    

%))!,00D0A]!2U!::UYYq\=M=MaY]=M=^^_HHH$,,	r   c                     |j                   }|j                  d|d   f      }| j                  |      } |j                  |d d  }|S )Nrx   )shapereshaper   view)r7   r   r   r   s       r   encodezXcodecEuclideanCodebook.encode   sO    ##%--r59o>MM-0	"INNE#2J/	r   c                 F    t        j                  || j                        }|S r<   )F	embeddingro   )r7   r   	quantizeds      r   decodezXcodecEuclideanCodebook.decode   s    KK	4::6	r   )	r   r   r   r   r/   r   r   r   rA   rB   s   @r   rk   rk      s    +9r   rk   c                   4     e Zd ZdZdef fdZd Zd Z xZS )XcodecVectorQuantizationzY
    Vector quantization implementation. Currently supports only euclidean distance.
    r$   c                 B    t         |           t        |      | _        y r<   )r.   r/   rk   codebook)r7   r$   r8   s     r   r/   z!XcodecVectorQuantization.__init__   s    /7r   c                 b    |j                  ddd      }| j                  j                  |      }|S Nr   r   r   )permuter   r   )r7   r   embed_ins      r   r   zXcodecVectorQuantization.encode  s/    %--aA6==''6r   c                 b    | j                   j                  |      }|j                  ddd      }|S r   )r   r   r   )r7   r   r   s      r   r   zXcodecVectorQuantization.decode  s/    ==''	2##Aq!,r   )	r   r   r   r   r   r/   r   r   rA   rB   s   @r   r   r      s    8| 8
r   r   c                        e Zd ZdZdef fdZd ZddefdZdde	j                  de	j                  fdZd	e	j                  de	j                  fd
Z xZS ) XcodecResidualVectorQuantizationzv
    Residual vector quantization implementation. Follows Algorithm 1 in https://huggingface.co/papers/2107.03312
    r$   c                    t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        |j                  | _        |j                  | _	        |j
                  | _        y c c}w r<   )
r.   r/   r0   rG   rangenum_quantizersr   
quantizers
frame_raterr   )r7   r$   _r8   s      r   r/   z)XcodecResidualVectorQuantization.__init__  sk    --SXY_YnYnSo(pa)A&)I(pq ++#11$33 )qs   Bc                 `    t        j                  | j                        | j                  z  dz  S )zReturn bandwidth per quantizer.i  )mathlog2rr   r   )r7   s    r   get_bandwidth_per_quantizerz<XcodecResidualVectorQuantization.get_bandwidth_per_quantizer  s%    yy++,t>EEr   r:   c           	          | j                         }| j                  }|0|dkD  r+t        t        dt	        j
                  ||z                    }|S )z:Return num_quantizers based on specified target bandwidth.        r   )r   r   r?   r~   r   floor)r7   	bandwidthbw_per_qr   s       r    get_num_quantizers_for_bandwidthzAXcodecResidualVectorQuantization.get_num_quantizers_for_bandwidth  sL    335,, Y_ Q

9x3G(H!IJNr   
embeddingsc                     | j                  |      }|}g }| j                  d| D ]:  }|j                  |      }|j                  |      }||z
  }|j	                  |       < t        j                  |      }	|	S )a  
        Encode the input tensor into discrete indices using RVQ, with the number of quantizers selected based on the given bandwidth.
        Each quantizer /codebook residually quantizes the input and returns the nearest indices in terms of Euclidian distance.
        N)r   r   r   r   appendr   stack)
r7   r   r   r   residualall_indices	quantizerr   r   out_indicess
             r   r   z'XcodecResidualVectorQuantization.encode%  s    
 >>yI.9 	(I&&x0G!((1I)+Hw'		(
 kk+.r   codesc                     t        j                  d|j                        }t        |      D ]*  \  }}| j                  |   }|j                  |      }||z   }, |S )z9Decode the given codes to their quantized representation.r   )device)r   tensorr   rY   r   r   )r7   r   quantized_outr[   r   r   r   s          r   r   z'XcodecResidualVectorQuantization.decode5  s^    S>#E* 	6JAw*I!((1I)I5M	6 r   r<   )r   r   r   r   r   r/   r   r?   r   r   r@   r   r   rA   rB   s   @r   r   r     s\    4| 4F#  %,,  ELL U\\ r   r   c                   r    e Zd ZdZeZdZdZdZ e	j                         d        Zd Zd Zed        Zdd
Zy	)XcodecPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    xcodecinput_valuesaudioc                    t        |t        j                        rct        j                  |j
                  d| j                  j                         |j                   t        j                  |j                         yyt        |t        j                  t        j                  f      r?t        j                  |j                         t        j                  |j
                         yt        |t        j                        rt        j                  |j
                         |j                  `t        j                   |j"                  |j$                  |j&                  d   z  z        }t        j(                  |j                  | |       yy|j*                  j,                  dk(  r t        j                  |j.                         yt        |t        j0                        r|j3                          yt        |t        j4                        r#t        j                  |j
                  dd       yt        |t6              r|j8                  j;                         D ]^  }t        |t        j                        st        j<                  |j
                  d       t        j>                  |j                  d       ` |j@                  j;                         D ]^  }t        |t        j                        st        j<                  |j
                  d       t        j>                  |j                  d       ` yt        |tB              rt        jD                  |jF                  tI        jJ                  d	g             t        j                  |jL                         t        j                  |jN                         t        j                  |jP                         yy)
zInitialize the weightsr   )meanstdNr   )abSnake1dg{Gz?)r   T))
isinstancer0   Linearinitnormal_weightr$   initializer_ranger,   zeros_	LayerNorm	GroupNormones_r4   kaiming_normal_r   sqrtr+   r%   r-   uniform_r8   r   alpharc   reset_parameters	EmbeddingXcodecModelacoustic_encodermodulestrunc_normal_	constant_acoustic_decoderrk   copy_rm   r   r@   rn   ro   rp   )r7   modulek	submodules       r   _init_weightsz#XcodecPreTrainedModel._init_weightsK  sv    fbii(LLSdkk6S6ST{{&FKK( 'r|| <=KK$JJv}}%		*  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' &&)3JJv||$ 2 23##%-LLSd;, $44<<> 6	i3&&y'7'7TBNN9>>156 $44<<> 6	i3&&y'7'7TBNN9>>156  78JJv}}ellD6&:;KK++,KK%KK(()	 9r   c                 8   t         j                  j                  j                  j                  } || j
                  j                          || j
                  j                         | j
                  j                  D ]`  } ||j                         |j                  |j                  |j                  fD ]&  } ||j                          ||j                         ( b  || j                  j                  d        || j                  j                  d       | j                  j                  D ]f  } ||j                  d       |j                  |j                  |j                  fD ]*  } ||j                  d        ||j                  d       , h y)znApply weight norm in the acoustic encoder and decoder because the original checkpoint has weight norm applied.r   nameN)r   r0   utilsparametrizationsweight_normr   r5   r6   r^   	res_unit1	res_unit2	res_unit3r   conv_t1)r7   r   r^   res_units       r   apply_weight_normz'XcodecPreTrainedModel.apply_weight_normq  s9   hhnn55AAD))//0D))//0**00 	,E$"__eoouO ,HNN+HNN+,	, 	D))//h?D))//h?**00 	;EH5"__eoouO ;HNN:HNN:;	;r   c                    | j                   | j                  fD ]  }|j                         D ]  }	 t        j                  j
                  j                  |d       t        |d      s<d|j                  v sKt        j                  j
                  j                  j                  |dd         y# t        t        f$ r Y hw xY w)z=Remove the weight norm from the acoustic encoder and decoder.r   r   r   T)leave_parametrizedN)r   r   r   r   r0   r   remove_weight_normrW   AttributeErrorhasattrr   parametrizeremove_parametrizations)r7   r   ms      r   r   z(XcodecPreTrainedModel.remove_weight_norm  s    ,,d.C.CD 	mF^^% mHHNN55ah5G 101h!BTBT6THHNN..FFq(gkFlm	m #N3 s   +B22CCc                 R    dt         j                  ffdt         |            S )zA
        Recursively iterate to fetch all Conv1d layers.
        r   c                     g }t        | t        j                        r|j                  |        | j	                         D ]  }|j                   |              |S r<   )r   r0   r4   r   childrenextend)r   params_listchildget_conv1d_layers_recursives      r   r   zMXcodecPreTrainedModel._get_conv1d_layers.<locals>.get_conv1d_layers_recursive  sZ    K&")),""6*  * G""#>u#EFG r   )r0   Moduletuple)r7   r   r   s     @r   _get_conv1d_layersz(XcodecPreTrainedModel._get_conv1d_layers  s&    
			 
	 0899r   Nc                 V    || }| j                  |      }|D ]  }t        ||      } |S )zo
        For a given module, compute the output length that would be obtained after all Conv1d layers.
        )r   r   )r7   input_lengthr   conv1d_layerslayers        r   _get_conv1d_output_lengthsz0XcodecPreTrainedModel._get_conv1d_output_lengths  sC     >F//7" 	EE/|DL	E r   r<   )r   r   r   r   r   config_classbase_model_prefixmain_input_nameinput_modalitiesr   no_gradr   r   r   r   r   r   r   r   r   r   r   ?  s^    
  L $OU]]_#* #*J;,	m : :&r   r   z$The Xcodec neural audio codec model.)custom_introc                       e Zd Z fdZedej                  fd       Zdej                  dej                  fdZ
e	 	 ddej                  dedz  d	edz  dej                  ez  fd
       Ze	 ddej                  d	edz  dej                  ez  fd       Ze	 	 	 ddej                  dej                  dz  dedz  d	edz  deej                  ej                  f   ez  f
d       Z xZS )r   c                 8   t         |   |       || _        |j                  dz  | _        t        j                  |j                        }|j                  | _	        |j                  | _        | j                  | j                         t        |      | _        t        |      | _        t        j                  |j"                        j%                         | _        t)        j*                  |j,                  |j,                        | _        t)        j*                  |j,                  |j"                  j,                        | _        t)        j*                  |j,                  |j                  j,                        | _        t5        |      | _        | j9                          y )Nr   )r.   r/   r$   
hop_lengthpadr   from_configacoustic_model_configencoderr   decoderr   _adjust_dac_decoderrQ   encoder_semanticrg   decoder_semanticsemantic_model_configevalsemantic_modelr0   r   hidden_sizefcfc1fc2r   r   	post_init)r7   r$   acoustic_modelr8   s      r   r/   zXcodecModel.__init__  s)    $$)"..v/K/KL . 6 6 . 6 6  !6!67 / 7 / 7'33F4P4PQVVX))F..0B0BC99V//1M1M1Y1YZ99V//1M1M1Y1YZ9&A 	r   r  c                 z   | j                         D ]]  }t        |t        j                        st        |j                  t
              r|j                  d   n|j                  }|dz  f|_        _ t        | d      r?t        | j                  t        j                        rt        j                         | _        yyy)z
        DAC implemented in Xcodec is slightly different from the HF version.
        DAC in Xcodec adjusts the output padding in every ConvTranspose1d in the decoder and removes
        the final `nn.Tanh` activation function.
        r   r   tanhN)r   r   r0   rc   r)   r   rd   r   r  TanhIdentity)r  r   r)   s      r   r  zXcodecModel._adjust_dac_decoder  s     oo' 	6F&""4"45-7u-Mq)SYS`S`)/!%	6 7F#
7<<(I;;=GL )J#r   r   r:   c                 T   |d d dd d f   }t        j                  || j                  | j                  f      }t        j                         5  | j	                  |d      }|j
                  }d d d        t        j                  d      }|j                  d      S # 1 sw Y   2xY w)Nr   T)output_hidden_statesr   ry   )r   r	  r   r  r  r   r   r   )r7   r   outputsr   stackeds        r   _extract_semantic_featuresz&XcodecModel._extract_semantic_features  s    #Aq!G,uu\DHHdhh+?@]]_ 	2)),T)RG#11M	2 ++m3|||""	2 	2s    BB'Nr   return_dictc                    ||n| j                   j                  }|j                  d   }|dk7  rt        d|       || j                   j                  d   }n>|| j                   j                  vr&t        d| d| j                   j                   d      | j                  |      j                         }| j                  |j                  dd            }| j                  |j                  d   | j                        |j                  d   k7  r<| j                  t        j                  || j                  | j                  f            }n| j                  |      }t        j                  ||gd      }| j                  |j                  dd            j                  dd      }| j                   j#                  ||      }	|	j                  d	d      }	|s|	S t%        |	      S )
ac  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            Float values of the input audio waveform.
        bandwidth (`float`, *optional*):
            The target bandwidth in (kbps) supports only values in `config.target_bandwidths`.
            Defaults to the highest available bandwidth `4.0` kbps.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`].

        Returns:
            `torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)` containing the discrete encoded audio codes.
        r   zAudio must be mono, but got rx   z)This model doesn't support the bandwidth z. Select one of .r   ry   r   )r$   r#  r   rW   target_bandwidthsr"  detachr  	transposer   r   r   r	  r   catr  r   r   r   )
r7   r   r   r#  channelse_semantic_input
e_semantic
e_acousticr   r   s
             r   r   zXcodecModel.encode  s   & &1%<k$++BYBY%%a(q=;H:FGG55b9Idkk;;;;I;FVW[WbWbWtWtVuuvw   ::<HOOQ**+;+E+Ea+KL
 **<+=+=a+@$BWBWX\f\l\lmn\oo..quu\DHHdhhCW/XYJ..|<JYY
J7Q?
WWZ11!Q78BB1aH
nn++J	B!++Aq1";//r   r   c                 2   ||n| j                   j                  }|j                  dd      }| j                  j	                  |      }| j                  |j                  dd            j                  dd      }| j                  |      }|s|S t        |      S )a  
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`):
            Discrete code indices computed using `model.encode`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`]

        Returns:
            Decoded audio values of shape `(batch_size, channels, num_samples)` obtained using the decoder part of
            Xcodec.
        r   r   r   )r$   r#  r(  r   r   r  r   r!   )r7   r   r#  r   quantized_acousticr   s         r   r   zXcodecModel.decode  s      &1%<k$++BYBY!++Aq1NN))+6	!XXi&9&9!Q&?@JJ1aP,,-?@"<00r   c                     ||n| j                   j                  }|j                  d   }|| j                  ||d      }| j	                  ||      d   dd|f   }|s||fS t        ||      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            The raw float values of the input audio waveform.
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`:
            Discrete code indices computed using `model.encode`.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        return_dict (`bool`, *optional*):
            Whether to return a [`XcodecOutput`] instead of a plain tuple.

        Returns:
            `XcodecOutput` or tuple `(audio_codes, audio_values)`:
            - `audio_codes` of shape `(batch_size, num_quantizers, codes_length)`: the quantized discrete codes.
            - `audio_values` of shape `(batch_size, channels, num_samples)`: the reconstructed audio waveform given the codes.

        Example:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoFeatureExtractor, XcodecModel

        >>> model_id = "hf-audio/xcodec-hubert-librispeech"
        >>> model = XcodecModel.from_pretrained(model_id)
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

        >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
        >>> audio_sample = dataset[0]['audio']['array']

        >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```
        Nrx   F)r#  r   .)r   r   )r$   r#  r   r   r   r   )r7   r   r   r   r#  lengthr   s          r   r>   zXcodecModel.forward2  s    \ &1%<k$++BYBY##B'++lI5+QK{{;K{HKCQXRXQXLY..,OOr   )NNr<   )NNN)r   r   r   r/   staticmethodr0   r   r  r   r   r"  r
   r@   floatboolr   r   r!   r   r   r   r>   rA   rB   s   @r   r   r     sh   & )RYY ) )#u7H7H #UM^M^ #  #'#'	10ll10 4<10 D[	10
 
+	+10 10f  $(1\\1 D[1 
+	+	1 16  ,0"&#'8Pll8P \\D(8P 4<	8P
 D[8P 
u||U\\)	*\	98P 8Pr   r   ))r   r   dataclassesr   	functoolsr   r   torch.nnr0   torch.nn.functional
functionalr    r   r   audio_utilsr   modeling_utilsr   r   r	   r
   autor   configuration_xcodecr   r   r   r!   r   r#   rD   rQ   ra   rg   rk   r   r   r   r   __all__r   r   r   <module>r@     s\   !  !      & / : 0  . 
2; 
2 
2 0+ 0 0 2+ 2 2,299 ,6299 &bii <299 >%bii %Pbii @ryy ,/ryy /d r8 r rj GHuP' uP IuPp 1
2r   