
    iW~                        d Z ddlZddlZddlmc mZ ddlmZ ddlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZ  ej:                  e      Z G d dej@                        Z! G d dej@                        Z" G d dej@                        Z# G d dej@                        Z$ G d dej@                        Z% G d dej@                        Z& G d dej@                        Z' G d dej@                        Z( G d d ej@                        Z) G d! d"ej@                        Z* G d# d$ej@                        Z+e G d% d&e             Z,e G d' d(e,             Z- ed)*       G d+ d,e,e             Z.g d-Z/y).zPyTorch CPMAnt    N)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringlogging   )CpmAntConfigc                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )CpmAntLayerNormz~
    We use Root Mean Square (RMS) Layer Normalization, please see https://huggingface.co/papers/1910.07467 for details."
    configc                     t         |           |j                  | _        |j                  | _        t        j                  t        j                  |j                              | _	        y N)
super__init__epshidden_sizedim_normr   	Parametertorchemptyweightselfr   	__class__s     f/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/cpmant/modeling_cpmant.pyr   zCpmAntLayerNorm.__init__)   sE    ::**ll5;;v/A/A#BC    hidden_statesc                 p   |j                  d      | j                  k7  rt        d      |j                  }|j	                  t
        j                        j                  d      j                  dd      }|t        j                  || j                  z         z  j	                  |      | j                  z  }|S )f
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        z'hidden_states.size(-1) != self.dim_norm   T)dimkeepdim)sizer   AssertionErrordtypetor   float32powmeanrsqrtr   r   )r!   r%   	old_dtypevariances       r#   forwardzCpmAntLayerNorm.forward0   s    
 b!T]]2 !JKK!''	 ##EMM266q9>>2t>T&X5H)IIMMiX[_[f[ffr$   )
__name__
__module____qualname____doc__r   r   r   Tensorr6   __classcell__r"   s   @r#   r   r   $   s&    D| D
U\\ 
r$   r   c                        e Zd Zddef fdZ	 	 	 	 ddej                  dej                  dej                  dej                  dedz  d	e	dz  d
edz  dej                  dz  fdZ
 xZS )CpmAntAttentionNr   c                 V   t         |           |j                  | _        |j                  | _        |j                  | _        || _        t        j                  | j                  | j
                  | j                  z  d      | _
        t        j                  | j                  | j
                  | j                  z  d      | _        t        j                  | j                  | j
                  | j                  z  d      | _        t        j                  | j
                  | j                  z  | j                  d      | _        t        j                  j                  d      | _        |j"                  0t        j                  j%                  |j"                        | _        y d | _        y )NFbiasr(   r*   )p)r   r   r   	dim_modelnum_attention_heads	num_headsdim_head	layer_idxr   Linear	project_q	project_k	project_vattention_outr   Softmaxsoftmax	dropout_pDropoutdropoutr!   r   rI   r"   s      r#   r   zCpmAntAttention.__init__>   s   ++33"4>>4>>DMM3QX]^4>>4>>DMM3QX]^4>>4>>DMM3QX]^YYt~~'Et~~\abxx''B'/' 88++f.>.>+?DLDLr$   hidden_q	hidden_kvattention_maskposition_biasoutput_attentionspast_key_values	use_cachecache_positionc	           	         |j                  d      }	|j                  d      }
|j                  d      }| j                  |      }| j                  |      }| j                  |      }|j	                  |	|
| j
                  | j                        j                  dddd      }|j	                  |	|| j
                  | j                        j                  dddd      }|j	                  |	|| j
                  | j                        j                  dddd      }|4|j                  ||| j                  d|i      \  }}|j                  d      }t        j                  ||j                  dd            t        j                  | j                        z  }||z   }t        j                  ||j	                  |	d|
|      t        j                   d	      k(  t        j"                  t%        d
      |j&                  |j(                              }| j+                  |      }t        j                  ||j	                  |	d|
|      t        j                   d	      k(  t        j"                  d|j&                  |j(                              }|r|}nd}| j,                  | j-                  |      }t        j                  ||      }|j	                  |	| j
                  |
| j                        j                  dddd      }|j/                         j	                  |	|
| j
                  | j                  z        }| j1                  |      }||fS )ad  
        Args:
            hidden_q (`torch.Tensor`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)):
                Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)`
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r   r   r)   r   Nr\   r(   Fz-inf)devicer.   )r,   rK   rL   rM   viewrG   rH   permuteupdaterI   r   matmul	transposemathsqrtmasked_filltensorscalar_tensorfloatr_   r.   rP   rS   
contiguousrN   )r!   rU   rV   rW   rX   rY   rZ   r[   r\   
batch_sizelen_qlen_kquerykeyvaluescoreattn_weightss                    r#   r6   zCpmAntAttention.forwardR   s   : ]]1%
a q!x(nnY'y)

:udnndmmLTTUVXY[\^_`hhz5$..$--HPPQRTUWXZ[\

:udnndmmLTTUVXY[\^_`&(//UDNNM]_mLnoJCHHRLE UCMM"b$9:TYYt}}=UU%!!
Aue<U@SSfell%++V

 U#!!
Aue<U@SS%,,ekkJ

  LL<<#LL'E UE*

:t~~udmmLTTUVXY[\^_`  "''
E4>>DMM;YZ""5)l""r$   r   )FNNN)r7   r8   r9   r   r   r   r;   
BoolTensorboolr   r6   r<   r=   s   @r#   r?   r?   =   s     |  4 */(,!%.2M#,,M# <<M# ((	M#
 ||M#  $;M# M# $;M# t+M#r$   r?   c                        e Zd Zddef fdZ	 	 	 	 	 ddej                  dej                  dej                  dz  dedz  dedz  d	edz  d
ej                  dz  fdZ	 xZ
S )CpmAntSelfAttentionBlockNr   c                     t         |           t        |      | _        t	        ||      | _        |j                  r/t        j                  j                  |j                        | _
        y d | _
        y N)rI   )r   r   r   layernorm_before_attentionr?   self_attentionrQ   r   r   rR   rS   rT   s      r#   r   z!CpmAntSelfAttentionBlock.__init__   sV    *9&*A'-f	J 88++F,<,<=DLDLr$   r%   rW   rX   rY   rZ   r[   r\   c           
          | j                  |      }| j                  ||||||||      \  }}	| j                  | j                  |      }||z   }||	fS )a  
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )rz   r{   rS   )
r!   r%   rW   rX   rY   rZ   r[   r\   outputsrs   s
             r#   r6   z CpmAntSelfAttentionBlock.forward   sp    4 11-@ $ 3 3	!
 <<#ll7+G%/l**r$   r   NFNNNr7   r8   r9   r   r   r   r;   ru   r   r6   r<   r=   s   @r#   rw   rw      s     |   .2).(,!%.2*+||*+ *+ ||d*	*+
  $;*+ *+ $;*+ t+*+r$   rw   c                   D     e Zd Zdef fdZdej                  fdZ xZS )CpmAntDenseGatedACTr   c                 ,   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  j                         | _
        y NFrA   )r   r   r   rJ   r   dim_ffw_0w_1r   GELUactr    s     r#   r   zCpmAntDenseGatedACT.__init__   s[    99V//UK99V//UK88==?r$   r%   c                 r    | j                  | j                  |            }| j                  |      }||z  }|S )zTransform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        )r   r   r   )r!   r%   
gate_scores      r#   r6   zCpmAntDenseGatedACT.forward   s9     XXdhh}56
/"]2r$   	r7   r8   r9   r   r   r   r;   r6   r<   r=   s   @r#   r   r      s    #| #
U\\ 
r$   r   c                   D     e Zd Zdef fdZdej                  fdZ xZS )CpmAntFeedForwardr   c                 (   t         |           t        |      | _        |j                  /t
        j                  j                  |j                        | _        nd | _        t        j                  |j                  |j                  d      | _        y r   )r   r   r   w_inrQ   r   r   rR   rS   rJ   r   r   w_outr    s     r#   r   zCpmAntFeedForward.__init__   sg    '/	' 88++F,<,<=DLDLYYv}}f.@.@uM
r$   r%   c                     | j                  |      }| j                  | j                  |      }| j                  |      }|S )r'   )r   rS   r   r!   r%   s     r#   r6   zCpmAntFeedForward.forward   s>    
 		-0<<# LL7M

=1r$   r   r=   s   @r#   r   r      s!    N| NU\\ r$   r   c                   D     e Zd Zdef fdZdej                  fdZ xZS )CpmAntFFNBlockr   c                     t         |           t        |      | _        t	        |      | _        |j                  r/t        j                  j                  |j                        | _
        y d | _
        y r   )r   r   r   layernorm_before_ffnr   ffnrQ   r   r   rR   rS   r    s     r#   r   zCpmAntFFNBlock.__init__  sS    $3F$;!$V, 88++F,<,<=DLDLr$   r%   c                     | j                  |      }| j                  |      }| j                  | j                  |      }||z   }|S )z
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Hidden states before feed forward layer.
        )r   r   rS   )r!   r%   
ln_outputsr}   s       r#   r6   zCpmAntFFNBlock.forward  sJ     ..}=
((:&<<#ll7+G%/r$   r   r=   s   @r#   r   r     s      |  ||r$   r   c                        e Zd Zddef fdZ	 	 	 	 	 ddej                  dej                  dej                  dz  dedz  dedz  d	edz  d
ej                  dz  fdZ	 xZ
S )CpmAntTransformerBlockNr   c                 f    t         |           t        ||      | _        t	        |      | _        y ry   )r   r   rw   self_attr   r   rT   s      r#   r   zCpmAntTransformerBlock.__init__#  s(    09M!&)r$   r%   rW   rX   rY   rZ   r[   r\   c           	      b    | j                  |||||||      \  }}| j                  |      }||fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )rW   rX   rY   rZ   r[   r\   )r   r   )	r!   r%   rW   rX   rY   rZ   r[   r\   rs   s	            r#   r6   zCpmAntTransformerBlock.forward(  sL    4 '+mm)'/+) '4 '
#| /l**r$   r   r~   r   r=   s   @r#   r   r   "  s    *| * .2).(,!%.2%+||%+ %+ ||d*	%+
  $;%+ %+ $;%+ t+%+r$   r   c                        e Zd Zdef fdZ	 	 	 	 	 ddej                  dej                  dej                  dedz  dedz  d	edz  d
edz  dej                  dz  fdZ	 xZ
S )CpmAntEncoderr   c           	          t         |           |j                  | _        t	        j
                  t        | j                        D cg c]  }t        ||       c}      | _        t        |      | _
        y c c}w ry   )r   r   num_hidden_layers
num_layersr   
ModuleListranger   layersr   output_layernorm)r!   r   ir"   s      r#   r   zCpmAntEncoder.__init__Q  s_     22mmZ_`d`o`oZp$qUV%;Fa%P$qr / 7 %rs   A8Nr%   rW   rX   rY   output_hidden_statesrZ   r[   r\   c	           	          |rdnd}	|rdnd}
t        | j                        D ])  \  }}|r|	|fz  }	 |||||||      }|\  }}|s$|
|fz  }
+ | j                  |      }|r|	|fz  }	||	|
fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
         N)rY   rZ   r[   )	enumerater   r   )r!   r%   rW   rX   rY   r   rZ   r[   r\   all_hidden_statesall_self_attnsr   layerlayer_outputsrs   s                  r#   r6   zCpmAntEncoder.forwardX  s    : #7BD0d!$++. 	2HAu#!m%55!!"3 /#M +8'M< </1	2 --m<-!11/??r$   )NNNNNr   r=   s   @r#   r   r   P  s    8| 8 *.,0(,!%.24@||4@ 4@ ||	4@
  $;4@ #Tk4@ 4@ $;4@ t+4@r$   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CpmAntIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r   r   r   rJ   r   intermediate_sizedense
isinstance
hidden_actstrr   intermediate_act_fnr    s     r#   r   zCpmAntIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r$   r%   returnc                 J    | j                  |      }| j                  |      }|S r   )r   r   r   s     r#   r6   zCpmAntIntermediate.forward  s&    

=100?r$   r7   r8   r9   r   r   r;   r6   r<   r=   s   @r#   r   r     s#    9U\\ ell r$   r   c                        e Zd Zdef fdZdej                  dej                  dej                  dej                  fdZd Zd
d	Z	 xZ
S )CpmAntSegmentPositionEmbeddingr   c                 b   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        t        j                  t        j                  |j                  |j                  z  |j                  z   |j                              | _        y r   )r   r   rF   rG   position_bias_num_bucketsnum_bucketsposition_bias_max_distancemax_distancesegment_typesnum_segmentsr   r   r   r   relative_attention_biasr    s     r#   r   z'CpmAntSegmentPositionEmbedding.__init__  s    33!;;"=="00')||KK$$v';';;f>^>^^**(
$r$   key_pos	query_poskey_segmentquery_segmentc           	      0   t        j                         5  |j                  d      }|j                  d      }|j                  d      }|j                  d      |j                  d      k7  r0t        d|j                  d       d|j                  d       d      ||j                  d      k7  s||j                  d      k7  r!t        d| d|j                  d       d      ||j                  d      k7  r!t        d| d|j                  d       d      |j	                  |d|      }|j	                  ||d      }|j	                  |d|      }|j	                  ||d      }| j                  ||      }|| j                  z   }| j                  t        j                  |t         j                  |j                  	      d d d f   t        j                  |t         j                  |j                  	      d d d f   z
  | j                  | j                  
      }	t        j                  ||k(  |	d d d d d f   |      }d d d        t        j                  | j                        }
|
j!                  dddd      j#                         }
|
S # 1 sw Y   MxY w)Nr   r   z>key_pos.size(0) should be equal to query_pos.size(0), but got z and !z7keylen should be equal to key_segment.size(1), but got z;querylen should be equal to query_segment.size(1), but got r(   r.   r_   )r   r   r   r)   )r   no_gradr,   r-   r`   !_segment_relative_position_bucketr   _position_bucketarangeint32r_   r   whereF	embeddingr   ra   rk   )r!   r   r   r   r   batchkeylenquerylenrelative_position_bucketabsolute_position_bucketembedss              r#   r6   z&CpmAntSegmentPositionEmbedding.forward  s    ]]_ %	LLOE\\!_F ~~a(H||A).."33$TU\UaUabcUdTeejktkykyz{k|j}}~  ))!,,M<N<Nq<Q0Q$MfXUZ[f[k[klm[nZoopq  =--a00$QRZQ[[`anasastuav`wwxy  ll5"f5G!uh;I%**5"f=K)..uhCM'+'M'Mm]h'i$'?$BRBR'R$ (,'<'<V5;;?W?^?^_`dfg`gh,,xu{{C[CbCbcdegkdklm ,,!..	 (= ($ (-{{-(q!4(($C%	P 5t7S7ST1a+668W%	 %	s   H+JJc                 &    || j                   z  |z   S r   )r   )r!   r   r   s      r#   r   z@CpmAntSegmentPositionEmbedding._segment_relative_position_bucket  s    t000;>>r$   c                 .   d}|dz  }|dkD  j                  t        j                        |z  }t        j                  |      }|dz  }||k  }|t        j                  |j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  ||j                  t        j                        |      z  }|S )Nr   r)   r   )
r/   r   r   abslogrj   re   min	full_liker   )r!   relative_positionr   r   relative_buckets	max_exactis_smallrelative_position_if_larges           r#   r   z/CpmAntSegmentPositionEmbedding._position_bucket  s   -155ekkB[P!II&781$	$y0%.II'--/);<hh|i/01Y&( "U[[/	&"
 &+YY&OO6aH&
" 	EKK2C2F2Fu{{2SUoppr$   )       )r7   r8   r9   r   r   r   r;   r6   r   r   r<   r=   s   @r#   r   r     sU    
| 
22 <<2 \\	2
 ||2h? r$   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )CpmAntOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y )N)r   )r   r   r   rJ   r   r   r   	LayerNormlayer_norm_epsrR   hidden_dropout_probrS   r    s     r#   r   zCpmAntOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r$   r%   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   )r   rS   r   )r!   r%   r   s      r#   r6   zCpmAntOutput.forward  s7    

=1]3}|'CDr$   r   r=   s   @r#   r   r     s1    >U\\  RWR^R^ r$   r   c                   V     e Zd ZU eed<   dZ ej                          fd       Z xZ	S )CpmAntPreTrainedModelr   cpmantc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              r7t	        j                  |j                  d| j                  j                         yy)zInitialize the weightsg        )r2   stdN)r   _init_weightsr   r   initones_r   r   normal_r   r   init_std)r!   moduler"   s     r#   r   z#CpmAntPreTrainedModel._init_weights  s]     	f%fo.JJv}}% >?LL77ct{{G[G[\ @r$   )
r7   r8   r9   r   __annotations__base_model_prefixr   r   r   r<   r=   s   @r#   r   r   
  s+     U]]_] ]r$   r   c                        e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 dde	j                  dz  dedz  d	edz  d
edz  dedz  dedz  de	j                  dz  dee	j                     ez  fd       Z xZS )CpmAntModelr   c                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                  |j                  z  z   |j                        | _        t        |      | _        |j                  | _        |j                  | _	        | j                          y r   )r   r   r   encoderr   	Embeddingr   r   segment_embedding
vocab_sizeprompt_typesprompt_lengthinput_embeddingr   rX   	post_initr    s     r#   r   zCpmAntModel.__init__  s     $V,!#f.B.BFDVDV!W!|| 3 3f6J6J JJFL^L^ 
 <FC#11 ++r$   c                     | j                   S r   r  r!   s    r#   get_input_embeddingsz CpmAntModel.get_input_embeddings(  s    ###r$   c                     || _         y r   r
  )r!   
embeddingskwargss      r#   set_input_embeddingsz CpmAntModel.set_input_embeddings+  s
    )r$   c                 *   |j                  d      }|j                  d      }|j                  }t        j                  ||      t        j                  ||      j	                  dd      k  }|d d d d d f   |d d d d d f   j                         |j	                  d||      z  z  }	|	|d d d d d f   |d d d d d f   k(  z  }	t        j                  t        t        || j                  z
              d d d   |      d d d f   j                  |d      |d d d f   k  }
t        j                  t        j                  || j                  |      j                         |
fd      }
|
j	                  ||d      |
j	                  |d|      z  |	z  }	|	S )Nr   r   )r_   r(   rC   )r,   r_   r   r   r`   logical_notrh   listr   r  repeatcatonesru   )r!   	input_idsspancontextlengthr   seqlenr_   directional_mask_2drW   mask_1ds              r#   _prepare_attention_maskz#CpmAntModel._prepare_attention_mask.  s   q!"!!#ll6&AU\\RXagEhEmEmnprsEtt D!,Aq$J++-0C0H0HFTZ0[[
 (44
+;tAq$J?O+OP LLeFT-?-?$?@A$B$GPVWX\^_X_`gghmopqQWo 	 ))UZZt/A/A&QVVXZabhij eVQ7',,uaQW:XX[iir$   Nr  rY   r   rZ   r[   return_dictr\   r   c           
         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j
                  t        j                  k7  r|j                  t        j                        }|j
                  |j                  }
}	t        j                  |dk7  dd      j                  |	|
      }|dk7  j                  d      j                  |	|
      }t        j                  t        j                  | j                  dz  | j                  z   | j                  dz  | j                  z   |	|
      j!                  |j#                  d      d      |fd      }|j#                         \  }}t        j                  t        j$                  || j                  |	|
      |fd      }t        j&                  ||fd|	|
      }t        j                  ||	|
      j!                  |d      }t        j&                  ||fd|	|
      }|r|t)        | j                   	      }||j+                         nd}|j-                         }| j/                  |      }| j1                  |      }|dk7  r|ddddddf   }||z   }| j3                  ||||      }| j5                  ||||      }|dd|dddf   }|dddd|dddf   }|dd|dddf   }| j7                  ||||||||      \  }}}|dk(  rw|dd| j                  dddf   }|4d
}|D ]+  }||dddd| j                  d| j                  df   fz  }- |}|'d
}|D ]  }||dd| j                  dddf   fz  }  |}|st9        d ||||fD              S t;        ||||      S )ai  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nr   r)   r   r(   r   r   rC   )r   r   c              3   &   K   | ]	  }||  y wr   r   ).0vs     r#   	<genexpr>z&CpmAntModel.forward.<locals>.<genexpr>  s      bcbos   )last_hidden_staterZ   r%   
attentions)r   rY   r   use_return_dictr[   r.   r   r   r/   r_   r   sumr  r   r  r  r  r,   zerosfullr	   get_seq_lengthrk   r  r  r  rX   r  tupler   )r!   r  rY   r   rZ   r[   r  r\   r  r.   r_   segmentr  r   
seq_lengthr  positionr  past_lengthr%   segment_statesrW   rX   r   all_attentionsnew_attentions	attentionnew_hidden_stateshidden_states                                r#   r6   zCpmAntModel.forward@  s   * 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!*!6IDKK<Q<Q	 ??ekk)!U[[1I!)9)9v++i1na366U66RQ,##B'**v*FII&&*T__<&&*T__<!	
 &*A. 
	 &NN,z))U[[0B0B%X^_ahiopq**eZ0!5P<<
%GNNuVWXzz5*-qfM0*$++>O:I:Uo446[\((*	,,Y7//8!+ArsAI6N%655iwPVW**8XwP';<(:;%aKL!&;<%aq&89;?<< 	<
8(. !)!T-?-?-A1*DEM)!#!/ eI"yAt7I7I7KTM_M_Ma1a'b&ddNe!/ ,$&!$5 UL%,q$:L:L:NPQ7Q*R)TT%U$5! )?<M~^   '+++%	
 	
r$   )NNNNNNN)r7   r8   r9   r   r   r  r  r  r   r   r;   ru   r   r,  r   r6   r<   r=   s   @r#   r   r     s    | $*$  *.)-,0(,!%#'.2i
<<$&i
  $;i
 #Tk	i

 i
 $;i
 D[i
 t+i
 
u||	6	6i
 i
r$   r   zy
    The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                   2    e Zd ZddiZdef fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  de	dz  de
dz  d	e
dz  d
e
dz  dej                  dz  de
dz  dej                  dz  dej                  dz  deej                  z  deez  fd       Zd Zd Z xZS )CpmAntForCausalLMzlm_head.weightzcpmant.input_embedding.weightr   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  |j                  |j                  z  z   d      | _
        | j                          y r   )r   r   r   r   r   rJ   r   r  r  r  lm_headr  r    s     r#   r   zCpmAntForCausalLM.__init__  sd     !&) yy 1 1F4G4G&J^J^4^ ^ej
 	r$   Nr  rZ   r[   rY   r   labelsr  rW   r\   logits_to_keepr   c           	         ||n| j                   j                  }| j                  |||||||	      }|r|j                  n|d   }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}|At               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )u<  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss.

        Example:

        Text Generation with CpmAntForCausalLM.
        ```python
        >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM

        >>> texts = "今天天气不错，"
        >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
        >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
        >>> input_ids = tokenizer(texts, return_tensors="pt")
        >>> outputs = model.generate(**input_ids)
        >>> output_texts = tokenizer.batch_decode(outputs)
        >>> print(output_texts)
        ['今天天气不错，阳光明媚，我和妈妈一起去超市买东西。\n在超市里，我看到了一个很好玩的玩具，它的名字叫“机器人”。它有一个圆圆的脑袋，两只圆圆的眼睛，还有一个圆圆的']
        ```
        Nr   r(   r   )losslogitsrZ   r%   r&  )r   r'  r   r%  r   intslicer;  r   r`   r,   r   rZ   r%   r&  )r!   r  rZ   r[   rY   r   r<  r  rW   r\   r=  r  model_outputr%   slice_indicesr@  r?  	loss_funcoutputs                      r#   r6   zCpmAntForCausalLM.forward  s   T &1%<k$++B]B]{{ 
 ;F66<XY?8B>SV8W~ot4]kmA}a,?@A(*IV[[V[[_=v{{2ODYab!11F)-)9TGf$EvE%(88&44#..
 	
r$   c                 .    | j                   j                  S r   r   r  r  s    r#   r  z&CpmAntForCausalLM.get_input_embeddings
  s    {{***r$   c                 &    || j                   _        y r   rH  )r!   r  s     r#   r  z&CpmAntForCausalLM.set_input_embeddings  s    &0#r$   )
NNNNNNNNNr   )r7   r8   r9   _tied_weights_keysr   r   r   r   r;   r   ru   rA  r,  r   r6   r  r  r<   r=   s   @r#   r9  r9    s    +,KL|   *.(,!%)-,0&*#'.2.2-.H
<<$&H
 H
 $;	H

  $;H
 #TkH
 t#H
 D[H
 t+H
 t+H
 ell*H
 
'	'H
 H
T+1r$   r9  )r9  r   r   )0r:   re   r   torch.nn.functionalr   
functionalr   torch.nnr    r   r   activationsr   cache_utilsr   r	   
generationr
   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_cpmantr   
get_loggerr7   loggerModuler   r?   rw   r   r   r   r   r   r   r   r   r   r   r9  __all__r   r$   r#   <module>rZ     su         % & ! . ) O - , . 
		H	%bii 2b#bii b#J4+ryy 4+n")) (		 4RYY 6++RYY ++\<@BII <@@ Y RYY Y z299  ]O ] ] P
' P
 P
f 
\1- \1
\1~ Hr$   