
    
i                     6   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddlm	Z
 ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$  ejJ                  e&      Z' G d dejP                        Z) G d dejP                        Z* G d dejP                        Z+ G d de      Z,e G d de             Z-e G d de-             Z. ed        G d! d"e-e             Z/ ed#        G d$ d%e-             Z0g d&Z1y)'zPyTorch OpenAI ImageGPT model.    N)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPast)PreTrainedModel)Conv1D)auto_docstringloggingtorch_float)maybe_autocast   )ImageGPTConfigc                   h     e Zd Zddee   def fdZdej                  dej                  fdZ	 xZ
S )ImageGPTLayerNormhidden_sizeepsc                     t         |           || _        t        j                  t        j                  |            | _        y N)super__init__r   r   	ParametertorchTensorweight)selfr   r   	__class__s      j/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.pyr    zImageGPTLayerNorm.__init__0   s.    ll5<<#<=    tensorreturnc                     |t        j                  t        j                  t        j                  |      dd      | j                  z         z  }|| j
                  z  }|S )NT)axiskeepdim)r"   sqrtmeansquarer   r$   )r%   r)   s     r'   forwardzImageGPTLayerNorm.forward5   sK    %**UZZV0D2W[%\_c_g_g%ghh$++%r(   )gh㈵>)__name__
__module____qualname__tupleintfloatr    r"   r#   r2   __classcell__r&   s   @r'   r   r   /   s5    >E#J >U >
ell u|| r(   r   c                       e Zd Zddedz  dedz  f fdZddZddZd Zd Z		 	 	 	 	 	 	 dd	e
j                  d
edz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  de
j                  dz  defdZ xZS )ImageGPTAttentionNis_cross_attention	layer_idxc           	         t         |           || _        |j                  }| j	                  dt        j                  t        j                  ||ft
        j                              j                  dd||      d       |j                  | _        |j                  | _        | j                  | j                  z  | _        | j                  | _        | j                  | j                  z  | j                  k7  r&t!        d| j                   d| j                   d      |j"                  | _        || _        |j&                  | _        || _        |j*                  | _        | j$                  rNt-        d	| j                  z  | j                        | _        t-        | j                  | j                        | _        n(t-        d
| j                  z  | j                        | _        t-        | j                  | j                        | _        t5        j6                  |j8                        | _        t5        j6                  |j<                        | _        y )Nbiasdtyper   F)
persistentz=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).   r   ) r   r    configmax_position_embeddingsregister_bufferr"   trilonesboolviewr   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsr=   scale_attn_by_inverse_layer_idxr>   reorder_and_upcast_attnr   c_attnq_attnc_projr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout)r%   rE   r=   r>   max_positionsr&   s        r'   r    zImageGPTAttention.__init__=   s   66JJuzz=-"@

STYY1m]  	 	
  ++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;r(   c                 6   t        j                  ||j                  dd            }| j                  r |t	        |j                  d      dz        z  }| j                  r|t        | j                  dz         z  }| j                  s|j                  d      |j                  d      }}| j                  d d d d ||z
  |d |f   }t        j                  |j                        j                  }	t        j                  |	|j                  |j                        }	t        j                   |||	      }|||z   } t#        j$                  d      |      }|j'                  |j                        }| j)                  |      }t        j                  ||      }
|
|fS )Nr,         ?r   rB   devicedim)r"   matmul	transposerR   r   sizerS   r8   r>   r=   r@   finforB   minr)   rb   wherer   SoftmaxtyperZ   )r%   querykeyvalueattention_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs              r'   _attnzImageGPTAttention._attne   sb   ||E3==R+@A""'+ejjn6K*LLL //'%0B*CCL&&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'.8L)rzzb),7 $((5((6ll<7L((r(   c                 j   |j                         \  }}}}|j                         \  }	}	}
}	t        j                  ||z  ||
t        j                  |j                        }d}| j
                  r |t        |j                  d            dz  z  }| j                  r|t        | j                  dz         z  }t        |j                  j                  d      5  |j                  d||      |j                  dd      j                  d||
      }}t        j                  ||j                         |j                         d	|
      }|j                  ||||
      }d d d        | j                  s|j                  d      |j                  d      }}| j                  d d d d ||z
  |d |f   }t        j                   |j"                        j$                  }t        j&                  ||j"                  |j                        }t        j(                  |||      }|||z   } t+        j,                  d      |      }|j"                  t        j                  k7  rt/        d      |j                  |j"                        }| j1                  |      }t        j2                  ||      }||fS # 1 sw Y   SxY w)Nra         ?r,   r`   r   F)enabledr_   r   )betaalpharc   zDError with upcasting, attn_weights does not have dtype torch.float32)rg   r"   emptyfloat32rb   rR   r8   rS   r>   r   rl   reshaperf   baddbmmr=   r@   rh   rB   ri   r)   rj   r   rk   RuntimeErrorrZ   re   )r%   rm   rn   ro   rp   bszrN   	q_seq_lendk_	k_seq_lenrq   scale_factorqkrr   rs   rt   ru   rv   s                       r'   _upcast_and_reordered_attnz,ImageGPTAttention._upcast_and_reordered_attn   sZ   (-

%Y	2 XXZ1i {{3?IyPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L ELL--u= 	V==Y3S]]2r5J5R5RSUWY[d5eqA ==qwwy!'')RS[ghL'//Y	9UL	V
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'.8L)rzzb),7 .eff#((5((6ll<7L((;	V 	Vs   BJ((J2c                 x    |j                         dd ||fz   } |j                  | }|j                  dddd      S )zJ
        Splits hidden_size dim into attn_head_size and num_heads
        Nr,   r   rD   r   r   )rg   rK   permuter%   r)   rN   attn_head_size	new_shapes        r'   _split_headszImageGPTAttention._split_heads   sE     KKM#2&)^)DD	i(~~aAq))r(   c                     |j                  dddd      j                         }|j                         dd ||z  fz   }|j                  |      S )zS
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        r   rD   r   r   Nr_   )r   
contiguousrg   rK   r   s        r'   _merge_headszImageGPTAttention._merge_heads   sO     1a+668KKM#2&)n*D)FF	{{9%%r(   hidden_states
layer_pastrp   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionscache_positionr*   c	                 |   |d u}	|j                   \  }
}}|St        |t              rA|j                  j	                  | j
                        }|	r|j                  }n|j                  }n|}|	r|n|}|	r%t        | d      st        d      |[rY| j                  |      }j                  | j
                     j                  }|j                  | j
                     j                  }nQ| j                  |      }| j                  |      j                  | j                   d      \  }}|j#                  |
d| j$                  | j&                        j)                  dd      }|j#                  |
d| j$                  | j&                        j)                  dd      }n| j                  |      j                  | j                   d      \  }}}|j#                  |
d| j$                  | j&                        j)                  dd      }|j#                  |
d| j$                  | j&                        j)                  dd      }|D|	s|nd }j+                  ||| j
                  d|i      \  }}|	rd|j                  | j
                  <   |j#                  |
|| j$                  | j&                        j)                  dd      }| j,                  r| j/                  ||||      \  }}n| j1                  ||||      \  }}| j3                  || j$                  | j&                        }| j5                  |      }| j7                  |      }||fS )	NrV   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.rD   rc   r,   r   r   T)shape
isinstancer   
is_updatedgetr>   cross_attention_cacheself_attention_cachehasattrrQ   rV   layerskeysvaluesrU   splitrP   rK   rN   rO   rf   updaterT   r   rw   r   rW   r\   )r%   r   r   rp   r   r   r   r   r   r=   r   seq_lenr   r   curr_past_key_valuescurrent_statesrm   rn   ro   rv   rq   s                        r'   r2   zImageGPTAttention.forward   s    3$>'--Wa!*&9:'2266t~~F
%+5+K+K(+5+J+J('1$2D.-4* t 
 %*M2*11$..AFF,33DNNCJJM2![[8>>tTU>V
UhhsBFPPQRTUV

3DNNDMMJTTUVXYZ $N ; A A$//WX A YE3((3DNNDMMBLLQPQRCJJsBFPPQRTUVE!3E^4N-44S%RbdrQstJC!8<
%%dnn5

3GQQRSUVW''(,(G(GsTY[i(j%K(,

5#un(U%K''T^^T]]Skk+.((5L((r(   )FNr   NNNNFFN)r3   r4   r5   rJ   r7   r    rw   r   r   r   r"   r#   r	   r6   r2   r9   r:   s   @r'   r<   r<   <   s    &<4$; &<SVY]S] &<P )D.)`*& $(.2596:!&)..2C)||C) DLC) t+	C)
  %||d2C) !&t 3C) $;C)  $;C) t+C) 
C)r(   r<   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ImageGPTMLPc                     t         |           |j                  }t        ||      | _        t        ||      | _        t        |j                     | _        t        j                  |j                        | _        y r   )r   r    r   r   c_fcrW   r   activation_functionactr   rX   r[   dropout)r%   intermediate_sizerE   rL   r&   s       r'   r    zImageGPTMLP.__init__  s_    &&	,i8	Y(9:&445zz&"4"45r(   r   r*   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   rW   r   )r%   r   s     r'   r2   zImageGPTMLP.forward  s@    		-0/M2]3r(   )r3   r4   r5   r    r"   r#   r2   r9   r:   s   @r'   r   r     s#    6U\\ ell r(   r   c                        e Zd Zd fd	Z	 	 	 	 	 	 	 ddej
                  dedz  dej
                  dz  dej
                  dz  dej
                  dz  dedz  d	edz  d
ej
                  dz  defdZ	 xZ
S )ImageGPTBlockNc                    t         |           |j                  }|j                  |j                  nd|z  }t	        ||j
                        | _        t        ||      | _        t	        ||j
                        | _	        |j                  r/t        |d|      | _        t	        ||j
                        | _        t        ||      | _        y )N   r   r>   T)r=   r>   )r   r    r   n_innerr   layer_norm_epsilonln_1r<   attnln_2add_cross_attentioncrossattentionln_cross_attnr   mlp)r%   rE   r>   r   	inner_dimr&   s        r'   r    zImageGPTBlock.__init__  s    ((&,nn&@FNNa+o	%kv7P7PQ	%f	B	%kv7P7PQ	%%"3Ft_h"iD!2;FD]D]!^Dy&1r(   r   r   rp   r   r   r   r   r   r*   c	           	         |}	| j                  |      }| j                  ||||||      }
|
d   }|
dd  }||	z   }|Xt        | d      st        d|  d      |}	| j	                  |      }| j                  |||||||      }|d   }|	|z   }||dd  z   }|}	| j                  |      }| j                  |      }|	|z   }|f|z   S )N)r   rp   r   r   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   rp   r   r   r   r   )r   r   r   rQ   r   r   r   r   )r%   r   r   rp   r   r   r   r   r   residualattn_outputsrv   outputscross_attn_outputsfeed_forward_hidden_statess                  r'   r2   zImageGPTBlock.forward.  s4    !		-0yy!)/) ! 
 #1oqr"#h. ,4!12 =dV DZ Z  %H ..}=M!%!4!4%-&;'="3- "5 " -Q/K${2M 212 66G 		-0%)XXm%<" #=='))r(   r   r   )r3   r4   r5   r    r"   r#   r	   rJ   r6   r2   r9   r:   s   @r'   r   r     s    2$ $(.2596:!&)..27*||7* DL7* t+	7*
  %||d27* !&t 37* $;7*  $;7* t+7* 
7*r(   r   c                   h     e Zd ZU eed<   dZdZdZdZdgZ	 e
j                          fd       Z xZS )ImageGPTPreTrainedModelrE   transformer	input_ids)imageTr   c           
      ^   t         |   |       t        |t              r||j	                         D ]h  \  }}d|v sd|v st        j                  |d| j                  j                  t        j                  d| j                  j                  z        z         j yt        |t              r|j                  j                  }t        j                  |j                  t!        j"                  t!        j$                  ||ft         j&                              j)                  dd||             yy)	zInitialize the weights.rW   r$   g        rD   )r0   stdrA   r   N)r   _init_weightsr   r   named_parametersinitnormal_rE   initializer_rangemathr/   n_layerr<   rF   copy_r@   r"   rH   rI   rJ   rK   )r%   modulenamepr]   r&   s        r'   r   z%ImageGPTPreTrainedModel._init_weightsq  s     	f% fo.!224 vat#D(8LL$++2O2ORVR[R[\]`d`k`k`s`s\sRt2tuv  12"MMAAMJJ

5::}m&DEJJWX]]q- 3r(   )r3   r4   r5   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr"   no_gradr   r9   r:   s   @r'   r   r   h  sC    %!O!&*#()U]]_ r(   r   c            !           e Zd Zdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  de
dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  dedeez  fd       Z xZS )ImageGPTModelrE   c           	      Z   t         |   |       |j                  | _        t	        j
                  |j                  | j                        | _        t	        j
                  |j                  | j                        | _	        t	        j                  |j                        | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t%        | j                  |j&                        | _        d| _        | j-                          y c c}w )Nr   r   F)r   r    r   rL   r   	Embedding
vocab_sizewterF   wperX   
embd_pdropdrop
ModuleListrangenum_hidden_layersr   hr   r   ln_fgradient_checkpointing	post_init)r%   rE   ir&   s      r'   r    zImageGPTModel.__init__  s     ++<< 1 14>>B<< > >OJJv001	ERXRjRjLklqf Blm%dnn&:S:ST	&+#  ms   
D(c                     | j                   S r   r   )r%   s    r'   get_input_embeddingsz"ImageGPTModel.get_input_embeddings  s    xxr(   c                     || _         y r   r   )r%   new_embeddingss     r'   set_input_embeddingsz"ImageGPTModel.set_input_embeddings  s	    !r(   Nr   past_key_valuesrp   token_type_idsposition_idsinputs_embedsr   r   r   r   output_hidden_statesreturn_dictr   kwargsr*   c                    |
|
n| j                   j                  }
||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }||t        d      |G| j                  ||       |j                         }|j                  d|d         }|j                  d   }n0|#|j                         dd }|j                  d   }nt        d      ||j                  n|j                  }| j                  r%| j                  r|	rt        j                  d       d}	||j                  d|d         }|	r|t        | j                         }|1||j!                         nd}t#        j$                  |d   |	      |z   }||j'                  d      }|z|dk  rt        d
      |j                  |d      }|ddddddf   }|j)                  | j*                        }d|z
  t#        j,                  | j*                        j.                  z  }| j                   j0                  rE|C|j                         \  }}}||f}|t#        j2                  ||	      }| j5                  |      }nd}|| j7                  |      }| j9                  |      }||j)                  |j                        z   }|| j7                  |      }||z   }| j;                  |      }||j                  d      fz   }|
rdnd}|
r| j                   j0                  rdnd}|rdnd}t=        | j>                        D ]N  \  }}|r||fz   } |||||||	|
|      } | d   }|
s&|| d   fz   }| j                   j0                  sF|| d   fz   }P | jA                  |      } |j                  | }|r||fz   }|stC        d |||||fD              S tE        |||||      S )a;  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer,   r   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rE   )rb   z$batch_size has to be defined and > 0rA   ry    )r   r   r   r   r   rD   c              3   $   K   | ]  }|| 
 y wr   r  ).0vs     r'   	<genexpr>z(ImageGPTModel.forward.<locals>.<genexpr>I  s      = s   )last_hidden_stater   r   
attentionscross_attentions)#rE   r   r   r   use_return_dictrQ   %warn_if_padding_and_no_attention_maskrg   rK   r   rb   r   trainingloggerwarning_oncer
   get_seq_lengthr"   arange	unsqueezetorB   rh   ri   r   rI   invert_attention_maskr   r   r   	enumerater   r   r6   r   )!r%   r   r   rp   r   r   r   r   r   r   r   r   r   r   r  input_shape
batch_sizerb   past_seen_tokensencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeposition_embedsr   token_type_embedsoutput_shapeall_self_attentionsall_cross_attentionsall_hidden_statesr   blockr   s!                                    r'   r2   zImageGPTModel.forward  s]   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66y.Q#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T&&4==##p "	%+00[_EN0*$++>O!CRC^==?de+0<<BPV+WZj+jN)33A6L %Q !GHH+00R@N ,AtT1,<=N ,..TZZ.@N!N2ekk$**6M6Q6QQN ;;**/D/P=R=W=W=Y: 7$68O#P %-).4HQW)X&%)%?%?@V%W"%)"  HHY/M((<0%(:(:=;O;O(PP% $ 8),==M		-0"m&8&8&<%>>$5b4%64;;;Z;Zr`d"6BD!$&&) 	PHAu#$58H$H!%'=#"3-	G $AJM &9WQZM&I#;;22+?71:-+O('	P* 		-0***L9   1]4D D ':KM`bvw   9+++*1
 	
r(   )NNNNNNNNNNNNN)r3   r4   r5   r   r    r   r   r   r"   r#   r	   rJ   r   r6   r   r2   r9   r:   s   @r'   r   r     s]   ~  "  *.(,.2.2,0-1596:!%)-,0#'.2q
<<$&q
 q
 t+	q

 t+q
 llT)q
 ||d*q
  %||d2q
 !&t 3q
 $;q
  $;q
 #Tkq
 D[q
 t+q
 q
  
:	:!q
 q
r(   r   z
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            #           e Zd ZddiZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  de
dz  de
dz  de
dz  dej                  dz  dedeez  f d       Z xZS )ImageGPTForCausalImageModelingzlm_head.weightztransformer.wte.weightrE   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  dz
  d      | _        | j                          y )Nr   Fr@   )
r   r    r   r   r   Linearn_embdr   lm_headr   r%   rE   r&   s     r'   r    z'ImageGPTForCausalImageModeling.__init__a  sL     (0yy0A0AA0EER 	r(   Nr   r   rp   r   r   r   r   r   labelsr   r   r   r   r   r  r*   c                 &   ||n| j                   j                  }| j                  |||||||||
||||      }|d   }| j                  |      }d}|	r|dddddf   j	                         }|	dddf   j	                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a&
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 8 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
        ```N)r   rp   r   r   r   r   r   r   r   r   r   r   r   .r,   r   )losslogitsr   r   r	  r
  )rE   r  r   r*  r   r   rK   rg   r   r   r   r	  r
  )r%   r   r   rp   r   r   r   r   r   r,  r   r   r   r   r   r  transformer_outputsr   	lm_logitsr.  shift_logitsshift_labelsloss_fctoutputs                           r'   r2   z&ImageGPTForCausalImageModeling.forwardi  sO   L &1%<k$++B]B]"..+))%'"7#9/!5#) / 
 ,A.LL/	$S#2#q[1<<>L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`D\$7$;;F)-)9TGf$EvE0/??-;;*550AA
 	
r(   )NNNNNNNNNNNNNN)r3   r4   r5   _tied_weights_keysr   r    r   r"   r#   r	   rJ   r   r6   r   r2   r9   r:   s   @r'   r%  r%  X  sx    +,DE~   *.(,.2.2,0-1596:&*!%)-,0#'.2n
<<$&n
 n
 t+	n

 t+n
 llT)n
 ||d*n
  %||d2n
 !&t 3n
 t#n
 $;n
  $;n
 #Tkn
 D[n
 t+n
  !n
" 
2	2#n
 n
r(   r%  z
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                   B    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	dz  de	dz  de	dz  de	dz  de
deez  fd       Z xZS )ImageGPTForImageClassificationrE   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y )NFr'  )
r   r    
num_labelsr   r   r   r(  r)  scorer   r+  s     r'   r    z'ImageGPTForImageClassification.__init__  sR      ++(0YYv}}dooEJ
 	r(   Nr   r   rp   r   r   r   r,  r   r   r   r   r  r*   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }|j                  d      }| j	                  |      }d}|| j                  ||| j                         }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```N)	r   rp   r   r   r   r   r   r   r   r   r   rc   )r.  r/  r   r   r	  )
rE   r  r   r0   r;  loss_functionr   r   r   r	  )r%   r   r   rp   r   r   r   r,  r   r   r   r   r  r0  r   pooled_hidden_statesr/  r.  r5  s                      r'   r2   z&ImageGPTForImageClassification.forward  s    f &1%<k$++B]B]"..+))%'/!5# / 
 ,A.,11a1801%%ffdkkBDY!4QR!88F)-)9TGf$EvE//??-;;*55
 	
r(   )NNNNNNNNNNN)r3   r4   r5   r   r    r   r"   r#   r	   rJ   r   r6   r   r2   r9   r:   s   @r'   r8  r8    s%   ~   *.(,.2.2,0-1&*!%)-,0#'T
<<$&T
 T
 t+	T

 t+T
 llT)T
 ||d*T
 t#T
 $;T
  $;T
 #TkT
 D[T
 T
 
1	1T
 T
r(   r8  )r%  r8  r   r   )2__doc__r   typingr   r"   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   utils.genericr   configuration_imagegptr   
get_loggerr3   r  Moduler   r<   r   r   r   r   r%  r8  __all__r  r(   r'   <module>rP     s?   %     % & ! C C ) 9 
 . # 
 , 2 
		H	%
		 
N)		 N)b")) "G*. G*T o  D I
+ I
 I
X z
%<o z
z
z _
%< _
_
Dr(   