
    i                     R   d Z ddlZddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z#  e!jH                  e%      Z&dZ'	 ddl(m)Z) dZ'e&jU                  d        G d dej\                        Z/e'se)Z/ G d dej\                        Z0 G d dej\                        Z1 G d dej\                        Z2 G d d ej\                        Z3 G d! d"ej\                        Z4 G d# d$ej\                        Z5 G d% d&e      Z6e G d' d(e             Z7 G d) d*e7      Z8 G d+ d,ej\                        Z9 ed-.       G d/ d0e7e             Z:d0d(gZ;y# e+$ r Y e,$ r e&j[                  d       Y w xY w)1zPyTorch Pop2Piano model.    N)nn)CrossEntropyLoss)GenerationConfig   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutput)PreTrainedModel)auto_docstringis_torchdynamo_compilinglogging   )Pop2PianoConfigT)FusedRMSNormFzVDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pop2PianoLayerNormzIDiscovered apex but it failed to load, falling back to Pop2PianoLayerNormc                   &     e Zd Zd fd	Zd Z xZS )Pop2PianoLayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zj
        Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      l/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/modeling_pop2piano.pyr   zPop2PianoLayerNorm.__init__8   s1     	ll5::k#:; #    c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   T)keepdim)tor   float32powmeanrsqrtr"   r!   dtypefloat16bfloat16)r#   hidden_statesvariances      r'   forwardzPop2PianoLayerNorm.forward@   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r(   )gư>)__name__
__module____qualname__r   r7   __classcell__r&   s   @r'   r   r   7   s    $+r(   r   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoDenseActDenseconfigc                 ^   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _
        t        |j                     | _        y NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr#   r?   r&   s     r'   r   zPop2PianoDenseActDense.__init__V   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r(   c                    | j                  |      }| j                  |      }| j                  |      }t        | j                  j
                  t        j                        r|j                  | j                  j
                  j                  k7  r`| j                  j
                  j                  t        j                  k7  r/|j                  | j                  j
                  j                        }| j	                  |      }|S N)rG   rM   rK   
isinstancerH   r!   r   Tensorr2   int8r-   )r#   r5   s     r'   r7   zPop2PianoDenseActDense.forward]   s    ./]3tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r(   r8   r9   r:   r   r   r7   r;   r<   s   @r'   r>   r>   U   s    / /r(   r>   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoDenseGatedActDenser?   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y rA   )r   r   r   rD   rE   rF   wi_0wi_1rH   rI   rJ   rK   r   rL   rM   rN   s     r'   r   z$Pop2PianoDenseGatedActDense.__init__m   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r(   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rP   )rM   rX   rY   rK   rQ   rH   r!   r   rR   r2   rS   r-   )r#   r5   hidden_geluhidden_linears       r'   r7   z#Pop2PianoDenseGatedActDense.forwardu   s    hhtyy78		-0#m3]3 tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r(   rT   r<   s   @r'   rV   rV   l   s    / /r(   rV   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoLayerFFr?   c                    t         |           |j                  rt        |      | _        nt        |      | _        t        |j                  |j                        | _	        t        j                  |j                        | _        y )Nr%   )r   r   is_gated_actrV   DenseReluDenser>   r   rE   layer_norm_epsilon
layer_normr   rI   rJ   rK   rN   s     r'   r   zPop2PianoLayerFF.__init__   s_    "=f"ED"8"@D,V^^AZAZ[zz&"5"56r(   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S rP   )rd   rb   rK   )r#   r5   forwarded_statess      r'   r7   zPop2PianoLayerFF.forward   s=    ??=9../?@%5E(FFr(   rT   r<   s   @r'   r^   r^      s    7 7r(   r^   c                   f     e Zd Z	 	 ddededz  f fdZed	d       Zd
dZ	 	 	 	 	 	 	 	 ddZ	 xZ
S )Pop2PianoAttentionNr?   	layer_idxc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  | j                  z  | _        || _        |9| j                  r-t        j!                  d| j"                  j$                   d       t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        | j                  r/t'        j2                  | j                  | j                        | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrB   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerE   d_kvkey_value_proj_dim	num_headsn_headsrJ   rK   	inner_dimri   loggerwarning_oncer&   r8   r   rD   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr#   r?   rl   ri   r&   s       r'   r   zPop2PianoAttention.__init__   si    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(&+#r(   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r*   r   )r-   r   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r'   _relative_position_bucketz,Pop2PianoAttention._relative_position_bucket   s(   , AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r(   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n|dddf   j                  |      }t        j                  |t        j
                  |      dddf   }||z
  }| j                  || j                   | j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )z%Compute binned relative position biasN)r2   device)r   r   r   )r*   r   r   r   )r{   r!   r   r   aranger   r-   r   rk   rm   rn   permute	unsqueeze)
r#   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r'   compute_biaszPop2PianoAttention.compute_bias   s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+.>>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r(   c
                    |j                   dd \  }
}|du}| j                  |      }|j                  |
d| j                  | j                        j                  dd      }d}t        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rG|j                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |
d| j                  | j                        j                  dd      }|j                  |
d| j                  | j                        j                  dd      }|T|s|	nd}	|j%                  ||| j                  d|	i      \  }}|r)t        |t              rd|j                  | j                  <   t'        j(                  ||j                  dd            }||j                   d	   }||n|	d   dz   }| j*                  sZt'        j,                  d| j                  ||f|j.                  |j0                  
      }| j2                  rE| j4                  r9d|_        n1| j9                  |||j.                  |	      }|dddd| dddf   }|#|ddddddd|j                   d	   f   }||z   }|}||z  }t:        j<                  j?                  |jA                         d      jC                  |      }t:        j<                  jE                  || jD                  | j4                        }t'        j(                  ||      }|j                  dd      jG                         }|j                  |
d| jH                        }| jK                  |      }||f}|r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr*   r+   r   Fr   Tr   )r   r2   )r   r   dim)ptraining)&shaperv   viewrr   rp   	transposerQ   r   
is_updatedgetri   cross_attention_cacheself_attention_cachelayerskeysr   rw   rx   updater   matmulrl   zerosr   r2   r|   r   requires_gradr   r   
functionalsoftmaxr   type_asrK   
contiguousrs   ry   )r#   r5   maskkey_value_statesposition_biaspast_key_valuesr   	use_cacheoutput_attentionsr   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                              r'   r7   zPop2PianoAttention.forward  s   " "/!4!4Ra!8
J .T9vvm,#((RtG^G^_iijkmno 
o':;(3377GJ!'6'L'L$'6'K'K$#2 -?)]/"=*-44T^^DIIJ/66t~~FMML/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~> lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!&&z2t~~Fff[)./Gr(   FN)T       )NN)NNNNNFFN)r8   r9   r:   r   intr   staticmethodr   r   r7   r;   r<   s   @r'   rh   rh      sa     %* $	 , , :	 ,D -  - ^. br(   rh   c                   @     e Zd Zddedz  f fdZ	 	 	 	 	 	 ddZ xZS )Pop2PianoLayerSelfAttentionNri   c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nrl   ri   r`   )r   r   rh   SelfAttentionr   rE   rc   rd   r   rI   rJ   rK   r}   s       r'   r   z$Pop2PianoLayerSelfAttention.__init__k  sT    /0KW`
 -V^^AZAZ[zz&"5"56r(   c           	          | j                  |      }| j                  |||||||      }	|| j                  |	d         z   }|f|	dd  z   }
|
S )N)r   r   r   r   r   r   r   r   )rd   r   rK   )r#   r5   attention_maskr   r   r   r   r   normed_hidden_statesattention_outputr   s              r'   r7   z#Pop2PianoLayerSelfAttention.forwards  ss      $}=-- '+/) . 
 &5Ea5H(II "%5ab%99r(   r   )NNNFFNr8   r9   r:   r   r   r7   r;   r<   s   @r'   r   r   j  s-    7SSWZ 7 r(   r   c                   B     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 ddZ xZS )Pop2PianoLayerCrossAttentionNri   c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFr   r`   )r   r   rh   EncDecAttentionr   rE   rc   rd   r   rI   rJ   rK   )r#   r?   ri   r&   s      r'   r   z%Pop2PianoLayerCrossAttention.__init__  sO    1&V[gpq,V^^AZAZ[zz&"5"56r(   c
                     | j                  |      }
| j                  |
||||||||		      }|| j                  |d         z   }|f|dd  z   }|S )N)r   r   r   r   r   r   r   r   r   r   )rd   r   rK   )r#   r5   r   r   r   r   r   r   r   r   r   r   layer_outputr   s                 r'   r7   z$Pop2PianoLayerCrossAttention.forward  sx      $}=// -'+%/) 0 

 %t||4DQ4G'HH/$4QR$88r(   rP   )NNNFNFNr   r<   s   @r'   r   r     s/    7#* 7 r(   r   c                   H     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 	 	 	 ddZ xZS )Pop2PianoBlockNri   c                 p   t         |           |j                  | _        t        j                         | _        | j
                  j                  t        |||             | j                  r&| j
                  j                  t        ||             | j
                  j                  t        |             y )Nr   )ri   )
r   r   rk   r   
ModuleListlayerappendr   r   r^   r}   s       r'   r   zPop2PianoBlock.__init__  s     ++]]_


'4O[d	

 ??JJ:6YWX

*623r(   c           
          | j                   d   ||||||	|      }|d   }|dd  }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }| j                  xr |d u}|r | j                   d   ||||||d   dz   ||	      }|d   }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }||dd  z   } | j                   d   |      }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }|f}||z   S )Nr   )r   r   r   r   r   r   r   i  )r   maxr+   )r   r   r   r   r   r   r   )r   r2   r   r3   r   isinfanyfinfor   clamprk   )r#   r5   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   r   r   return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                     r'   r7   zPop2PianoBlock.forward  sl    "/A)'+/)"
 /q12126 %--/++M*..0M//044t;M//044K
 "KKK<[YM!__R1Fd1R&3djjm!65; /+B/!3#"3	'# 4A6M ""emm3#kkKK.224KK 3 34884?KK 3 3488
 !&M|Q\ ] !24KAB4O O '

2}5 %--/++M*..0M//044t;M//044K
 "KKK<[YM " ''	
r(   r   )
NNNNNNFFTNr   r<   s   @r'   r   r     s:    4SSWZ 4" "#&*M
r(   r   c                   h    e Zd ZU eed<   dZdZdZdZdgZ	dgZ
 ej                         d        Zd	 Zy
)Pop2PianoPreTrainedModelr?   transformer)audioTFr   rH   c                 0   | j                   j                  }t        |t              r$t	        j
                  |j                  |dz         yt        |t              r0t	        j                  |j                  j                  d|dz         yt        |t              rlt	        j                  |j                  j                  d|dz         t        |d      r0t	        j                  |j                  j                  d|dz         yyt        |t              r9t	        j                  |j                  j                  d|| j                   j                   dz  z         t        |j                  d      r?|j                  j"                  )t	        j$                  |j                  j"                         t	        j                  |j&                  j                  d|| j                   j(                  dz  z         t        |j&                  d      rA|j&                  j"                  *t	        j$                  |j&                  j"                         yyyt        |t*              rt	        j                  |j,                  j                  d|| j                   j                   dz  z         t        |j,                  d      r?|j,                  j"                  )t	        j$                  |j,                  j"                         t	        j                  |j.                  j                  d|| j                   j                   dz  z         t        |j.                  d      r?|j.                  j"                  )t	        j$                  |j.                  j"                         t	        j                  |j&                  j                  d|| j                   j(                  dz  z         t        |j&                  d      rA|j&                  j"                  *t	        j$                  |j&                  j"                         yyyt        |t0              rP| j                   j                   }| j                   j2                  }| j                   j4                  }t	        j                  |j6                  j                  d|||z  dz  z         t	        j                  |j8                  j                  d||dz  z         t	        j                  |j:                  j                  d||dz  z         t	        j                  |j<                  j                  d|||z  dz  z         |j>                  r3t	        j                  |j@                  j                  d||dz  z         yyy)zInitialize the weights      ?        )r0   stdlm_head      rC   N)!r?   initializer_factorrQ   r   init	constant_r!   Pop2PianoConcatEmbeddingToMelnormal_	embedding!Pop2PianoForConditionalGenerationsharedhasattrr   r>   rG   rE   rC   zeros_rH   rF   rV   rX   rY   rh   ro   rq   rv   rw   rx   ry   rl   r{   )r#   modulefactorrE   rp   rr   s         r'   _init_weightsz&Pop2PianoPreTrainedModel._init_weights  s    //f01NN6==&3,7 =>LL))00sM ABLL--CVc\Jvy)V^^22&3,O * 67LL))DKKDWDW\`C`9abvyy&)fiinn.HFIINN+LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I) ;<LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I) 23kk))G!%!1!1kk++GLLs7M_C_dhBh8ijLLs'4-8PQLLs'4-8PQLLs7M_C_dhBh8ij11V;;BBRX]dim\mRno 2 4r(   c                 8   | j                   j                  }| j                   j                  }|t        d      |j	                  |j
                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |dk(  |       |S )Nzoself.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id..r+   r   ).r   z1self.model.config.pad_token_id has to be defined.)r?   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r#   	input_idsr  r  shifted_input_idss        r'   _shift_rightz%Pop2PianoPreTrainedModel._shift_rightD  s    !%!C!C{{//!) B  &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r(   N)r8   r9   r:   r   __annotations__base_model_prefixoutput_modalitiessupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulesr   no_gradr
  r   r(   r'   r   r     sS    %"&*#")*!FU]]_%p %pN!r(   r   c                   B     e Zd Z fdZd Z	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )Pop2PianoStackc                    t         |   |       t        j                  |j                  |j
                        | _        |j                  | _        t        j                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _        t        |j
                  |j                        | _        t        j"                  |j$                        | _        | j)                          d| _        y c c}w )Nr   r   r`   F)r   r   r   rz   
vocab_sizerE   embed_tokensrk   r   range
num_layersr   boolblockr   rc   final_layer_normrI   rJ   rK   	post_initr|   )r#   r?   ir&   s      r'   r   zPop2PianoStack.__init__[  s     LL):):FNNK ++]] v001 v4Q<[\]

 !36>>vG`G` azz&"5"56 	&+#s   7!Dc                     || _         y rP   )r#  r#   new_embeddingss     r'   set_input_embeddingsz#Pop2PianoStack.set_input_embeddingso  s
    *r(   c                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|$|"| j
                  rdnd}t        d| d| d      |&|j                         }|j                  d|d         }n8||j                         d d }n"| j
                  rdnd}t        d| d| d	      | j                  r%| j                  r|rt        j                  d
       d}|(| j                  t        d      | j                  |      }|\  }}|du r| j
                  st        d|  d      | j
                  rf|rr|p| j                   j                  r5t        t!        | j                         t!        | j                               }n%t!        | j                         }n| j
                  sd }||j#                         nd}|%t%        j&                  |||z   |j(                        }|1t+               s'||z   }t%        j,                  |||j(                        }| j                   j
                  rt/        | j                   ||||      }nX|d d d d d d f   }|j1                  |j2                        }d|z
  t%        j4                  |j2                        j6                  z  }| j
                  rO|M|j                         \  }}}||f}|!t%        j,                  ||j(                        }| j9                  |      }nd }|	rdnd }|rdnd }|r| j
                  rdnd }d }d }| j;                  |      }t=        | j>                        D ]b  \  }} |	r||fz   } | ||||||||||
      }!|!d   }|!d   }| j
                  r|	|!|rdnd   }|sD||!d   fz   }| j
                  sZ||!d   fz   }d | jA                  |      }| j;                  |      }|	r||fz   }|
stC        d |||||fD              S tE        |||||      S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer+   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoder)r?   r   r   )r?   r2  r   r   r   )r2   r   r  )r   r   r   r   r   r   r*      c              3   $   K   | ]  }|| 
 y wrP   r  ).0rx   s     r'   	<genexpr>z)Pop2PianoStack.forward.<locals>.<genexpr>  s      
 = 
s   )last_hidden_stater   r5   
attentionscross_attentions)#r?   r   r   output_hidden_statesuse_return_dictrk   r  sizer   r|   r   rt   ru   r#  is_encoder_decoderr   r
   get_seq_lengthr   r   r   r   r    r   r-   r2   r   r   invert_attention_maskrK   	enumerater'  r(  tupler   )"r#   r  r   r   r   r2  r   r   r   r;  r   r   kwargserr_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r   r5   r*  layer_modulelayer_outputss"                                     r'   r7   zPop2PianoStack.forwardr  s    "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	   ( !_`` --i8M!,
J?? #LTFRg!hii??_4;;11&9$DKK8,dkk:Z'O '3$++&FO #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D4zAO"ZZ
OML`L`aN;;!!,{{+-- /K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+"6BD0d&7DOOrRV(,%]3(4 	VOA|#$58H$H!(%/- /#"3-M *!,M
 *!,M#8#D0=CTaZ[0\- !/=3C2E!E??+?=QRCSBU+U(;	V> --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r(   )NNNNNNNNNNN)r8   r9   r:   r   r.  r7   r;   r<   s   @r'   r   r   Y  s6    ,(+
 "#!f
r(   r   c                   (     e Zd ZdZ fdZd Z xZS )r  z'Embedding Matrix for `composer` tokens.c                     t         |           t        j                  |j                  |j
                        | _        y )N)num_embeddingsembedding_dim)r   r   r   rz   composer_vocab_sizerE   r  rN   s     r'   r   z&Pop2PianoConcatEmbeddingToMel.__init__  s-    V5O5O_e_m_mnr(   c                     ||z
  }| j                  |      j                  d      }t        j                  ||gd      }|S )Nr   r   )r  r   r   cat)r#   featureindex_valueembedding_offsetindex_shiftedcomposer_embeddingr2  s          r'   r7   z%Pop2PianoConcatEmbeddingToMel.forward"  sC    #&66!^^M:DDQG		#5w"?QGr(   )r8   r9   r:   __doc__r   r7   r;   r<   s   @r'   r  r    s    1or(   r  zA
    Pop2Piano Model with a `language modeling` head on top.
    )custom_introc            #           e Zd ZdddZdef fdZd Zd Z	 ddej                  d	e
d
edej                  dz  fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  deeej"                        dz  dedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                     ez  f d       Z ej,                         	 	 	 d  fd	       Zdej"                  fdZ xZS )!r  zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr?   c                 4   t         |   |       || _        |j                  | _        t        j                  |j                  |j                        | _        t        |      | _
        t        j                  |      }d|_        d|_        t        |      | _        t        j                  |      }d|_        |j"                  |_        t        |      | _        t        j(                  |j                  |j                  d      | _        | j-                          y )NFTrB   )r   r   r?   rE   	model_dimr   rz   r"  r  r  mel_conditionercopydeepcopyrk   r   r   encodernum_decoder_layersr%  decoderrD   r   r)  )r#   r?   encoder_configdecoder_configr&   s       r'   r   z*Pop2PianoForConditionalGeneration.__init__4  s     ll6#4#4fnnE<VDv.$)!#( %n5v.$(!$*$=$=!%n5yy1B1BO 	r(   c                     | j                   S rP   )r  )r#   s    r'   get_input_embeddingsz6Pop2PianoForConditionalGeneration.get_input_embeddingsM  s    {{r(   c                 ~    || _         | j                  j                  |       | j                  j                  |       y rP   )r  rf  r.  rh  r,  s     r'   r.  z6Pop2PianoForConditionalGeneration.set_input_embeddingsP  s-    $)).9)).9r(   Ninput_featurescomposergeneration_configr   c                    |j                   }||vr(t        dt        |j                                d|       ||   }t	        j
                  || j                        }|j                  |j                  d         }t        |j                               }| j                  |||      }|Od||dddf   j                          <   t	        j                  |dddf   j                  dd	      |gd	
      }||fS |dfS )a  
        This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
        control the type of MIDI token generated by the model.

        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                input features extracted from the feature extractor.
            composer (`str`):
                composer token which determines the type of MIDI tokens to be generated.
            generation_config (`~generation.GenerationConfig`):
                The generation is used to get the composer-feature_token pair.
            attention_mask (``, *optional*):
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
        zPlease choose a composer from z. Composer received - r3  r   )rY  rZ  r[  Nr   r+   r   )axis)composer_to_feature_tokenr  listr   r   tensorr   repeatr   r   r   rc  r&  concatenater   )r#   rn  ro  rp  r   rs  composer_valuer[  s           r'   get_mel_conditioner_outputsz=Pop2PianoForConditionalGeneration.get_mel_conditioner_outputsU  s%   0 %6$O$O!4406O6T6T6V1W0XXnownxy  38<nT[[I'..~/C/CA/FG8??AB--"&- . 

 %;>NN1a4055778 #..q!t0D0I0I"a0PR`/ahijN!>11t##r(   r  decoder_input_idsdecoder_attention_maskencoder_outputsr   r2  decoder_inputs_embedslabelsr   r   r;  r   r   returnc                    ||n| j                   j                  }||n| j                   j                  }||t        d      |||}|| j	                  ||||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|
||	| j                  |
      }| j                  |||	||||||||      }|d   }| j                   j                  r|| j                  d	z  z  }| j                  |      }d}|
Ct        d
      } ||j                  d|j                  d            |
j                  d            }|s|f|dd z   |z   }||f|z   S |S t!        |||j"                  |j$                  |j&                  |j(                  |j*                  |j$                  |j&                  	      S )aq  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pop2Piano is a model with relative position embeddings
            so you should be able to pad the inputs on both the right and the left. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for detail.
            [What are input IDs?](../glossary#input-ids) To know more on how to prepare `input_ids` for pretraining
            take a look a [Pop2Piano Training](./Pop2Piano#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are decoder input IDs?](../glossary#decoder-input-ids) Pop2Piano uses the `pad_token_id` as the
            starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`
        NzSBoth `inputs_embeds` and `input_features` received! Please provide only one of them)r  r   r2  r   r;  r   r   r   r*   )r8  r5   r9  )r  r   r2  r   r   r   r   r   r;  r   r   r   r  )ignore_indexr+   )	losslogitsr   decoder_hidden_statesdecoder_attentionsr:  encoder_last_hidden_stater   encoder_attentions)r?   r   r<  r  rf  rQ   r   lenr  rh  tie_word_embeddingsrb  r   r   r   r=  r   r   r5   r9  r:  r8  )r#   r  r   rz  r{  r|  r   r2  rn  r}  r~  r   r   r;  r   r   rC  r5   decoder_outputssequence_output	lm_logitsr  loss_fctoutputs                           r'   r7   z)Pop2PianoForConditionalGeneration.forward  s3   R "+!6IDKK<Q<Q	%0%<k$++B]B]$)Crss'M,A*M ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1/!5#) ' 
 *!,;;**-1EFOLL1	'T:HINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r(   c                    || j                   } |j                  d	i | t        |d      st        d      t	        |j
                        | j                  j                  k7  r9t        d| j                  j                   dt	        |j
                         d      | j                  ||||      \  }}t        | (  d	d|||d|S )
a  
        Generates token ids for midi outputs.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation
        strategies and code examples, check out the [following guide](./generation_strategies).

        </Tip>

        Parameters:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`.
            attention_mask:
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
            composer (`str`, *optional*, defaults to `"composer1"`):
                This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
                `"composer"`. Please make sure that the composer value is present in `composer_to_feature_token` in
                `generation_config`. For an example please see
                https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            kwargs:
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
                Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:
                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
        Nrs  z`composer_to_feature_token` was not found! Please refer to https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.jsonand parse a dict like that.ztconfig.composer_vocab_size must be same as the number of keys in generation_config.composer_to_feature_token! Found z vs .)rn  r   ro  rp  )inputsr2  r   rp  r  )rp  r   r  r  r  rs  r?   rV  ry  r   generate)r#   rn  r   ro  rp  rC  r&   s         r'   r  z*Pop2PianoForConditionalGeneration.generate  s   l $ $ 6 6   *6* (*EF.   ::;t{{?^?^^889cBSBmBm>n=oopr  *.)I)I))/	 *J *
& w 
()/	

 
 	
r(   c                 $    | j                  |      S rP   )r  )r#   r~  s     r'   %prepare_decoder_input_ids_from_labelszGPop2PianoForConditionalGeneration.prepare_decoder_input_ids_from_labelsT  s      ((r(   rP   )NNNNNNNNNNNNNNN)N	composer1N)r8   r9   r:   _tied_weights_keysr   r   rl  r.  r   FloatTensorstrr   ry  r   
LongTensor
BoolTensorrB  rR   r	   r&  r   r7   r  r  r  r;   r<   s   @r'   r  r  )  s3    (7'6
 2: 48/$))/$ /$ ,	/$
 ))D0/$b  .23759:>=A(,2637:>*.!%)-,0#'26!q
##d*q
 ))D0q
 !++d2	q

 !& 0 04 7q
 uU\\23d:q
 q
 ((4/q
 ))D0q
  %0047q
   4'q
 $;q
  $;q
 #Tkq
 D[q
  ((4/!q
$ 
u  	!O	3%q
 q
f U]]_ W
 W
r)ELL )r(   r  )<r^  rd  r   r   r   torch.nnr   transformers.generationr   r1  r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   r   configuration_pop2pianor   
get_loggerr8   rt   _load_pop2piano_layer_normapex.normalizationr   infoImportError	ExceptionwarningModuler   r>   rV   r^   rh   r   r   r   r   r   r  r  __all__r  r(   r'   <module>r     s        % 4 & ! C C ) / 9 k k - F F 4 
		H	%! 
`/!&
KKhi+ +2 "%RYY .")) <ryy &I IZ")) F!299 !J\
/ \
~ D! D! D!N
- 
DBII  
g)(@/ g)
g)T	 /0J
KS!  	 `
NN^_`s   >F F&F&%F&