
    謜i7                       d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dl	mZ d dlmZmZmZmZmZ d dlZd dlmZ d d	lmZ d d
l m Z  d dl!Z!d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 erd dl!m2Z2m3Z3m4Z4 ddl5m6Z6  e(       rd dl7Z7 e,       rd dl8m9Z9  e*       xr  e'       xr  e+       xr  e)       Z:e:rd dl;Z;d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZR d dlNmMZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZg d dlhmiZimjZjmkZk  G d d egd!"      Zl G d# d$eUd!"      Zm G d% d&eGd!"      Zn ejel      Zo ejem      Zp ejen      Zqh d'Zrh d(Zsh d)Zt e1j                  ev      Zwd*d+d,d-iZx eyexj                               Z{d.Z|d/ Z}d0 Z~d1 Z G d2 d3ej                         Zd4ed5e$d6e$fd7Z G d8 d9      Z G d: d;      Z G d< d=      Zd>e_        evd?k(  r e       Zyy)@    N)	GeneratorIterable)asynccontextmanager)	lru_cache)BytesIO)Thread)TYPE_CHECKING	AnnotatedOptional	TypedDictUnion)scan_cache_dir)DecodeStream)tqdm)AutoTokenizerBitsAndBytesConfigGenerationConfigPreTrainedTokenizerBase)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )LogitsProcessorListTextIteratorStreamer)logging)PreTrainedModelPreTrainedTokenizerFastProcessorMixin)ContinuousBatchingManager)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionChatCompletionMessageChatCompletionMessageParam)Choice)ChatCompletionChunkChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                       e Zd ZU dZeed<   y))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__     R/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/cli/serve.pyrH   rH   s       	 rR   rH   F)totalc                       e Zd ZU dZeed<   y)+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rI   NrJ   rQ   rR   rS   rW   rW   z   rT   rR   rW   c                   4    e Zd ZU dZeed<   eed<   dZeed<   y)%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerI   FstreamN)	rK   rL   rM   rN   bytesrP   rO   r[   boolrQ   rR   rS   rY   rY      s    	 rR   rY   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopr_   audior`   logprobsmetadata	functions
modalities
predictionrf   rg   rh   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   ra   rb   languageru   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                 .    dd l }|j                  |        y Nr   )torchmanual_seed)_seedr   s     rS   set_torch_seedr      s    	erR   c                  v    dd l } | j                  j                         r| j                  j                          y y r   )r   cudais_availableempty_cache)r   s    rS   reset_torch_cacher      s*    zz 

  !rR   c                 ,    dd l }|j                  |       S r   )r   	ones_like)_input_tensorr   s     rS   torch_ones_liker      s    ??=))rR   c                       e Zd ZdZdZdZdZy)ModalityLLMVLMSTTTTSN)rK   rL   rM   r   r   r   r   rQ   rR   rS   r   r      s    
C
C
C
CrR   r   reqmodel_generation_configreturnc                 L   | j                  d      "t        di t        j                  | d         }nt	        j
                  |      } |j                  di |}|j                         D ]  \  }}|	t        |||        | j                  d      t        | d         |_
        | j                  d      t        | d         |_
        | j                  d      t        | d         |_        | j                  d      
| d   |_        | j                  d      
| d   |_        | j                  d      +t        | d         |_        t        | d         dk(  rd	|_        | j                  d
      t        | d
         |_        | j                  d      t%        | d          |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rI   max_output_tokens
max_tokensfrequency_penalty
logit_biasrl   temperatureg        Ftop_pseedrQ   )getr   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   r   )r   r   kwargsrI   non_standard_kwargskvs          rS   !create_generation_config_from_reqr      s   . ww"#/,Ttzz#>Q:R/ST MM*AB2+22<V<#))+ -1=%q!,-
 ww"#/+.s3F/G+H( ww|(+.s</@+A(
ww"#//4S9L5M/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A%]#$+*/'
www#"'G"5
wwv"s6{#rR   c                       e Zd ZdZd Zd Zy)	ToolStatez7Lightweight class to keep track of the tool call state.c                 $    | j                          y N)resetselfs    rS   __init__zToolState.__init__)  s    

rR   c                 <    d| _         d| _        d| _        d| _        y)z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    rS   r   zToolState.reset,  s!     %%*"!"rR   N)rK   rL   rM   rN   r   r   rQ   rR   rS   r   r   &  s    ArR   r   c                   L    e Zd ZdZ	 ddddeded   dz  fdZd	 Zd
 Zd Z	d Z
y)
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr   timeout_seconds	processor)r!   r    c                     || _         t        |j                        | _        || _        || _        t        j                  | j
                  | j                        | _	        | j                  j                          y r   )r   rO   name_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerr   )r   r   r   r   s       rS   r   zTimedModel.__init__:  s[     
 !3!34".ood&:&:D<P<PQrR   c                     | j                   j                          t        j                  | j                  | j
                        | _         | j                   j                          y)z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   s    rS   reset_timerzTimedModel.reset_timerG  s@    ood&:&:D<P<PQrR   c                     t        | d      rX| j                  K| `| `d| _        d| _        t        j                          t                | j                  j                          yyy)z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   r   r   r   s    rS   delete_modelzTimedModel.delete_modelM  sX    4!djj&<
DJ!DNJJL  KK  '=!rR   c                     | j                   dkD  r@| j                          t        j                  | j                   d| j                    d       y y )Nr   z was removed from memory after z seconds of inactivity)r   r   loggerinfor   r   s    rS   r   zTimedModel.timeout_reached\  sM    !#KK%%&&EdFZFZE[[qr $rR   c                 <    t        | d       xs | j                  du S )z)Check if the instances have been deleted.r   N)r   r   r   s    rS   
is_deletedzTimedModel.is_deletedc  s     4))?TZZ4-??rR   r   )rK   rL   rM   rN   r   r   r   r   r   r   r   rQ   rR   rS   r   r   4  sO     PT	   DEL	!@rR   r   c            $       N   e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dQdee ej                  d      f   dee ej                  d      f   deedz   ej                  d      f   d	ee ej                  d
      f   deedz   ej                  d      f   deedz   ej                  d      f   dee ej                  d      f   dee ej                  d      f   dee ej                  d      f   dee ej                  d      f   deedz   ej                  d      f   dee ej                  d      f   dee ej                  d      f   deedz   ej                  d      f   dee ej                  d d!"      f   d#df d$Z	d% Z
d& Zd'ed(ed)ed*efd+Zd'efd,Zd'efd-Zd'efd.Z	 	 	 	 	 	 	 	 dRd/ed0edz  d1edz  d2edz  d3edz  d4ee   dz  d5edz  d6ed7   d#efd8Zed9eez  d#efd:       ZeedSd;edz  d#eeeef      fd<              Zd=ed/ed#e e!z  fd>Z"edSd1d?d#e#fd@       Z$edAe#fdB       Z%d=ed#e e!z  fdCZ&d=ed#e'eddf   fdDZ(d=ed#efdEZ)d=ed#e'eddf   fdFZ*d=ed#efdGZ+d#e,dz  fdHZ-dIed#efdJZ.dKefdLZ/dKed#e0dM   fdNZ1dKed#e0dO   fdPZ2y)TServeNcontinuous_batchingz8Whether to use continuous batching for chat completions.)helpdevicezgDevice to use for inference; will default to `auto` and place the model on an accelerator if available.dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.trust_remote_codez2Whether to trust remote code when loading a model.attn_implementationzWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.quantizationzAWhich quantization method to use. choices: 'bnb-4bit', 'bnb-8bit'hostz$Interface the server will listen to.portzPort the server will listen to.model_timeoutz@Time in seconds after which a model will be removed from memory.	log_levelz8Logging level as a string. Example: 'info' or 'warning'.default_seedz1The default seed for torch, should be an integer.enable_corsztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.input_validationz+Whether to turn on strict input validation.force_modelzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.non_blockingTz/Whether to run the server in a separate thread.)hiddenr   r   c                     t         st        d      | _        | _        | _        | _        | _        | _        | _        | _	        |	 _
        |
 _        | _        | _        | _        | _        | _        |t#        |       t%        j&                  d      }|j)                  t$        j*                  |
j-                                   t%        j&                  d      }|j)                  t$        j*                  |
j-                                   i  _        d  _        d  _        d  _        d  _         j                   j                  rdnd _
         j                  r3 j9                   j                        }| _         j;                  |       t<        dt>        f fd       }t?        |      } j                  r2|jA                  tB        d	gd
d	gd	g       tD        jG                  d       ddl$m%} |jM                  d      d|dtN        f fd       }|jM                  d      dtN        f fd       }|jM                  d      d|f fd       }|jQ                  d      |jS                  d       fd              }|jS                  d      d        }|jU                  d      d|fd       }tW        jX                  | j                   j                   j                        }tW        jZ                  |       _.         j                   r j_                          y  j\                  ja                          y )NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`transformersz+transformers.generation.continuous_batching,  appc                   K   d  j                   j                         D ]  }|j                           j                  j                  j	                  dd       y y w)NT   blocktimeout)loaded_modelsvaluesr   #running_continuous_batching_managerrl   )r   r   r   s     rS   lifespanz Serve.__init__.<locals>.lifespan  s`     ++224 %""$%77C88==DRS=T Ds   A A#)r   *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsrequestbodyc                     j                  |       j                  r&j                  || j                  j                        S j                  |      S )Nr  ) validate_chat_completion_requestr   #continuous_batching_chat_completionstate
request_idgenerate_chat_completion)r  r  r   s     rS   chat_completionz'Serve.__init__.<locals>.chat_completion  sK    11$1?''??gmmF^F^__44T::rR   z/v1/responsesc                     j                  |        | j                  dd      }|sj                  |       }t        |      S j	                  |       }t        |d      S )Nr  r[   Ttext/event-stream
media_type)validate_response_requestr   generate_response_non_streamingr'   generate_responser(   )r  r[   response_objoutputr   s       rS   	responsesz!Serve.__init__.<locals>.responses  s`    **7*;[[40F#CCGL#L11++G4F$V8KLLrR   z/v1/audio/transcriptionsc           
        K   | j                         4 d {   }t        |d   j                          d {   |d         }t        j	                  d|d   j
                   d|d   j                   d|d   j                  dz  dd	       d d d       d {    j                  
       j                  |      }t        |d      S 7 7 7 8# 1 d {  7  sw Y   HxY ww)NrZ   r   )rZ   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr  r  r  )formrY   readr   debugfilenamecontent_typesizevalidate_transcription_requestgenerate_transcriptionr(   )r  r  parsed_requestr  r   s       rS   audio_transcriptionsz,Serve.__init__.<locals>.audio_transcriptions  s      ||~ 	 	!F#F|0022w-"
 %d6l&;&;%<M$v,JcJcId e!&\..5c:$@	 	 ///G00@F$V8KLL	2	 	 	 	sU   C+CC+CC
ACC+C5C+CC+C(CC($C+z
/v1/modelsc                  <    t        d j                         d      S )Nlist)objectdata)r'   get_gen_modelsr   s   rS   get_all_modelsz&Serve.__init__.<locals>.get_all_models  s      64;N;N;P QRRrR   z/healthc                      t        ddi      S )Nstatusok)r'   rQ   rR   rS   healthcheckz#Serve.__init__.<locals>.healthcheck  s    4 011rR   httpc                    K   | j                   j                  t              xs t        t	        j
                               }|| j                  _         ||        d {   }||j                   t        <   |S 7 wr   )headersr   X_REQUEST_IDrO   uuiduuid4r	  r
  )r  	call_nextr
  responses       rS   get_or_set_request_idz-Serve.__init__.<locals>.get_or_set_request_id#  s]      ,,\:Oc$**,>OJ'1GMM$&w//H-7H\*O 0s   AA9A7A9)r   r   r   )1serve_dependencies_availableImportErrorr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
get_loggersetLevel
log_levelslowerr   r   last_messageslast_kv_cache
last_modelprocess_model_nameload_model_and_processorr   r$   add_middlewarer&   r   warning_oncefastapir  postdictoptionsr   
middlewareuvicornConfigServerserverstart_serverrun)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   transformers_logger	cb_loggermodel_id_and_revisionr   r   r  r  r  r"  r(  r,  r5  configs   `                            rS   r   zServe.__init__k  s3   t ,s 
 $7 
!2#6 (		*"(& 0&( #<( &00@$$W%7%7	8I%JK&&'TU	7--ioo.?@A 57UY0 "!%'+'7'7SD$($;$;D<L<L$M!3DO))*?@		U 	U 
	U x( "e"&"e"e   g 	$	(	)	;W 	;D 	; 
*	; 
/	"		Mt 		M 
#		M 
,	-	M 	M 
.	M" 
\	"			S 
 
#	S 
		2 
	2 
		 	 
 	 $))$))t~~^nnV,KKOOrR   c                 ~      fd}t        j                  |dd       _         j                  j                          y )Nc                      t        j                          _        t        j                   j                          j                  j	                   j
                  j                                y r   )asyncionew_event_loop_loopset_event_looprun_until_completerK  server   s   rS   _runz Serve.start_server.<locals>._run4  sD     //1DJ""4::.JJ))$++*;*;*=>rR   zuvicorn-threadF)targetnamedaemon)r   r   _threadr   )r   rZ  s   ` rS   rL  zServe.start_server3  s2    	? !''t:JSXYrR   c                 &   | j                   st        d      | j                   j                         st        d      d| j                  _        | j                   r8| j                   j                         r| j                   j                  d       y y y )NzHThe server cannot be killed as it was not launched in a separate thread.zThe server is already killed.Tr   )r   )r^  
ValueErroris_aliverK  should_exitjoinr   s    rS   kill_serverzServe.kill_server=  so    ||ghh||$$&<=="&<<DLL113LLa( 4<rR   r  schema	validatorunused_fieldsc                    t         j                  d|        t        |j                               }|j                  }||z
  }|r(t         j                  d|        t        dd|       | j                  rB	 |j                  |       ||z  }	|	r(t         j                  d|	        t        dd|	       yy# t        $ rF}t         j                  d|j                                 t        d|j                               d}~ww xY w)a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   r  setkeys__mutable_keys__errorr%   r   validate_pythonrF   errors)
r   r  re  rf  rg  
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             rS   _validate_requestzServe._validate_requestH  s   . 	+G956 (
//$}4LL;O;LMNC:Z[jZk8lmm  H))'2 (2M'A$'=>V=WXY# #.LMeLf,g  ( ! # H1!((*>?#AHHJGGHs   :B< <	DADDc                 F    | j                  |t        t        t               y N)r  re  rf  rg  )rv  rH   response_validatorUNUSED_RESPONSE_FIELDSr   r  s     rS   r  zServe.validate_response_requesty  s!    <(0	 	 	
rR   c                 F    | j                  |t        t        t               y rx  )rv  rW   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr{  s     rS   r  z&Serve.validate_chat_completion_request  s!    >*7	 	 	
rR   c                 F    | j                  |t        t        t               y rx  )rv  rY   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr{  s     rS   r  z$Serve.validate_transcription_request  s!    8-5	 	 	
rR   r
  contentr   rolefinish_reason
tool_callsdecode_stream	tokenizerr    c	                     | |||j                  |j                  |      }t        |t        t	        j                               |t        t        |||      d|      gdd      }	|	S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        )r  r  r  r   )deltaindexr  r   zchat.completion.chunk)idcreatedr   choicessystem_fingerprintr%  )step
_tokenizerr/   r   timeChoiceChunkr0   )
r   r
  r  r   r  r  r  r  r  chunks
             rS   build_chat_completion_chunkz!Serve.build_chat_completion_chunk  s    D $)<AV#(()=)=wGG#		$% '!#-
 "/
  "*!
& rR   r  c                 .    d| j                  d       dS )a/  
        Builds an event of a streaming OpenAI Response model or a ChatCompletion chunk.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            chunk (`BaseModel` or `ChatCompletionChunk`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        zdata: Texclude_nonez

)model_dump_json)r  s    rS   chunk_to_sse_elementzServe.chunk_to_sse_element  s"     --4-@AFFrR   	cache_dirc           	         ddl m}m} g }t        j	                  d       t        t        |       j                        D ]@  }|j                  dk7  r|j                  }|j                         D ]  \  }}|j                  }t        d |D        d      }	|	s)t        j                  |	j                         j!                               }
t#        |
t$              rd|
v so|
d   }|j'                         |j'                         t)        fd|D              sd	|j*                  v r|j*                  j-                  d	      nd
}|j*                  |dk7  rd| nd
z   }|j/                  ||d|j0                  d        C |S )z2
        List LLMs and VLMs in the cache.
        r   !MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMESz/Scanning the cache directory for LLMs and VLMs.r   c              3   T   K   | ]   }|j                   d k(  s|j                   " yw)zconfig.jsonN)	file_name	file_path).0fs     rS   	<genexpr>z'Serve.get_gen_models.<locals>.<genexpr>  s      #_A!++Q^B^AKK#_s   ((Narchitecturesc              3   4   K   | ]  }|g v s|  y wr   rQ   )r  archllmsvlmss     rS   r  z'Serve.get_gen_models.<locals>.<genexpr>  s      P4$9OtPs   /r   main@)owned_byr  r%  r  )&transformers.models.auto.modeling_autor  r  r   warningr   r   repos	repo_typerefsr   filesnextr   r   openr  
isinstancerE  r   anyrepo_idsplitappendlast_modified)r  r  r  generative_modelsrepor  refrevision_infor  config_pathrQ  r  authorrepo_handler  r  s                 @@rS   r'  zServe.get_gen_models  sg   	

 HI	2889 	D~~(99D&*jjl "]%++"#_#_aef"K$4$4$6$;$;$=>"640_5N & 78??AAHHJPPP8;t||8KT\\//4QSF"&,,sf}AcU)RT"UK%,,(."-&-'+'9'9	'	B ! rR   r   c           	         
  j                  |d          j                  k7  } _        |r0 j                  $ j                  j                  dd       d _         j	                        \  }}t        |d      r|j                  n|t        ||j                  j                  j                  ddd	       j                  J|j                  
       _        t                j                  _         j                  j                          |j                  |d   ddd      j!                  |j"                        d   d    fd fd
 fd}
 fd} j                  j%                  |j&                  |j)                  d            }|j)                  d      rt+         ||      d      S  ||      }|j-                  d      }	t/        |	d      S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   r   r  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rI   messagespt)return_tensorsadd_generation_promptreturn_dict	input_idsr   c              3     K   ddl m} 	 j                  | d       d}j                  j	                  |       D ]  }|dz  }|j
                  r'|j
                  d   }j                  | ||       |j                  |j                  k(  sU|
j                  k\  }t        d	      r|j                  k(  }|xr | }|rd
nd}j                  | |        y  y # t        $ rT}	t        j                  t        |	             j                  j                  |        dt        |	       d Y d }	~	y d }	~	ww xY ww)Nr   )RequestStatus	assistantr  r   r      r   )r
  r  r   r  r  	eos_tokenlengthrl   r  r   data: {"error": ""})generation.continuous_batchingr  r  r   request_id_itergenerated_tokensr*  FINISHEDr   r   r  	Exceptionr   rn  rO   cancel_request)r
  r  r  n_tokens_generatedresulttoken_idgenerated_all_tokensfinal_token_is_eosreasonrt  rI   rP  r   r  s             rS   stream_chat_completionzIServe.continuous_batching_chat_completion.<locals>.stream_chat_completion>  sg    F(7 66z[p6qq%&""FFVVWab F&!+& ..#)#:#:2#>">>'1$,"7*7&/ ?   }}(>(>>/AEVEeEe/e, #9k:179;N;N1N.3G3bPbLb0-Av">>&*0"7 ?  
 ;>  7SV$88GG
S*3q6(#667sC   EBC$ AC$ !E"C$ #E$	E-A
D<7E<EEc                 t   d }j                   j                         r<|:j                   j                  | d      }j                   j                         r|:j                  |j                        }t        | t        t        j                               dt        dt        |d      d      g	      }|S )
Nr  )r
  r   chat.completionr   r  r  r  rl   r  messager  )r  r  r%  r   r  )
r   
is_running
get_resultdecoder  r+   r   r  r.   r,   )_request_idr  r  chat_completion_resultrP  r   r  s       rS   buffer_chat_completionzIServe.continuous_batching_chat_completion.<locals>.buffer_chat_completionk  s    F::EEGFNAALLXcmnLo ::EEGFN  &&v'>'>?G%3DIIK((+ 5gK X&,	&"" *)rR   c                `  K   	 t        j                         d      } | |      D ]3  }j                  |       t        j                  d       d {    5 y 7 # t        j
                  $ r7 j                  j                  |        t        j                  d|  d       Y y w xY ww)NFr   Request  was cancelled.)
r   tolistr  rT  sleepCancelledErrorr   r  r   r  )r  r  _chunkinputsr   r  s      rS   cancellation_wrapper_streamzNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_stream  s     H ,V]]_e D4[-P +F33F;;!--***+*)) H88GGT+oFGHs<   B.AA! AA! B.A! !AB+(B.*B++B.c                     	  |       S # t         j                  $ r7 j                  j                  |        t        j                  d|  d       Y y w xY w)Nr  r  )rT  r  r   r  r   r  )r  r  r   s    rS   cancellation_wrapper_bufferzNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_buffer  sT    H-k::)) H88GGT+oFGHs    AAAr[   )r
  r   	streamingr  r  r  application/json)r?  r>  r   rl   r@  r   r  r   rI   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   add_requestr   r   r(   r  r'   )r   r   r
  must_discard_cacher   r   r  r  r  
json_chunkr  rI   r  rP  r  r  s   `         @@@@@@rS   r  z)Serve.continuous_batching_chat_completion  s    !% 7 7G E2dooE/ 77C88==DRS=T;?8889NOy+29k+JI''PY	=$)$;$;"//"//
 33;7<7U7U"3 8V 8D4
 H[G\D44D44::< ..
ODZ^ / 

"U\\
;(()++	7Z	*4		H	H ==IIz:K:Z:Zfifmfmnvfw J 

 778$%@%LYlmm/
;E..D.AJ
7IJJrR   r   c                 .   | t        |t              rt        j                  S ddlm}m} | j                  j                  }||j                         v rt        j                  }|S ||j                         v rt        j                  }|S t        d|       )Nr   r  zUnknown modality: )r  r   r   r   r  r  r  	__class__rK   r   r   r`  )r   r   r  r  model_classnamemodalitys         rS   get_model_modalityzServe.get_model_modality  s     )%<=||#	

  //22HOOQQ||H   A H H JJ||H  1/1BCDDrR   r  c           	         g }| D ]  }|d   g d}|t         j                  k(  rmt        |d   t              r|d   }nMt        |d   t              r:g }|d   D ]  }|d   dk(  s|j                  |d          ! dj                  |      }|d<   n(|t         j                  k(  rt        |d   t              r|d   j                  d|d   d       n|d   D ]  }|d   dk(  r|d   j                  |        |d   dk(  s)d	|d   d
   v rt        j                  dd|d   d
         }t        j                  t        t        j                  |                  }t        j                   dd      }	|	j"                  }
|j%                  |	j"                         n|d   d
   }
|d   j                  d|
d        |j                  |        |S )Nr  r  r  r  typer^    )r  r^   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   r  rO   r$  r  rc  r   resubr#   r  r   r  	b64decodetempfileNamedTemporaryFiler\  save)r  r  processor_inputsr  parsed_messageparsed_contentr  
image_datar  rZ   r  s              rS   *get_processor_inputs_from_inbound_messagesz0Serve.get_processor_inputs_from_inbound_messages  s    '	4G&-fo"EN8<<' gi0#6%,Y%7N	 2D9%'N#*9#5 C"6?f4*11'&/BC &)XXn%=N,:y)X\\) gi0#6"9-44fgV_N`5ab#*9#5 \"6?f4*95<<WE$V_;'7;+?+FF-/VV4LbRYZeRfglRm-n
(-

76;K;KJ;W3X(Y'/'B'B&Y^'_&*ii %

499 5&-k&:5&A*95<<gVY=Z[\  ##N3O'	4P  rR   c                 4     j                    j                   |d<   |d   }|d   d   dk(  ry j                  |d          j                  k7  } _         j                        \  } j	                  |      } j                  ||      }dt        D ]/  }|j                  j                  d   j                         v s-| n |j                  |d	|j                  d
      dd	d	      }|j                  j                        }|j                  dd      d	}	dj                  j                  d   j                         v rd}	t        ||	d	      }
t        |j                         d} j#                  |      r=|s; j$                  j'                         }|d   j(                  d   |kD  r j$                  }i ||
d	|d fd}|j                  d      r(t+        t-         j.                   ||
            d      S g }d} ||
      }d}|D ]  }|j0                  d   }t3        |j4                  dd      r%|j7                  |j4                  j8                         |j:                  r|j:                  }t3        |dd      st|j<                  } t?        tA        tC        jB                               dtE        dtG        djI                  |      d      |      g|       }|jK                  d	!      }tM        |d"      S )#a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r   r  r  )r   r   Ttoolsr  )r  r   r  r  tokenizer
  req_0gptossFskip_special_tokensskip_promptr   r  )streamerrI   return_dict_in_generatepast_key_valuesc              3     K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}	 |j	                          t               }j                  d	
       d}d}	| D ](  }|	dz  }	dj                   j                  d   j                         v r|j                  d      }||z  }|r||v rd}QR|j                         t           d   k(  rd|_
        z|j                         t           d   k(  r(|j                          j                  |d d       |j                  r@|xj                  |z  c_        |j                  sYt        j                  d|j                        }
|
|
j!                  d      }
d|_        t#        t%        |
      dd|dz         }n|dk(  rMd|j                  vr]|xj&                  |j)                  d      z  c_        |xj&                  |j)                  d      z  c_        |j&                  dk  r&dj+                  |j-                  d      d d       dz   }t#        t%        |      dd      }j                  |d |g       |dk7  sj                  ||       + |	j.                  k\  }t1        | j2                  d      r || j2                  j4                  k(  }|xr | }|rdnd }j                  ||!       |j+                          |j+                          y # t6        $ r9}t8        j;                  t=        |             d"t=        |       d# Y d }~Nd }~ww xY w# |j+                          w xY ww)$NFr#  r   T<|channel|>final<|message|>c                  L     j                   di | }|j                  _        y NrQ   generater*  r=  r   generate_outputr   r   s     rS   generate_with_cachez[Serve.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cache?  $    "0%..":6":%4%D%D"rR   r[  r   r   r  r  r  
<|return|>r   r   r  )r
  r  r  r   z\"name\": \"(.*?)\")r\  function
_tool_call)r7  r  r  r  z"arguments": {{})	arguments)r7  r  r  )r
  r  r  r   )r  r   r  r  rl   r  r  r  )rQ  r  r;  r   r   r   r  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr1   r2   r   countrc  r  r   r   r  r  r  r   rn  rO   )r(  r  
filter_cotcot_trace_endr3  threadresults
tool_stater  r  	tool_nametoolr  r  r  rt  rI   generation_kwargsr   rP  r
  r   tool_model_familys                   rS   r  z>Serve.generate_chat_completion.<locals>.stream_chat_completion5  s     J M5<<55a8>>@@!
 =E #6?PQFGy&[
 66z[p6qq%&"& [F&!+&  5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G "(G3).J$$ )4!<<>->?P-QRY-ZZ:>J7$ "<<>->?P-QRW-XX&,,."&"B"B+6%).:&;	 #C #  %%66&--7- $.#C#C,.II6LjN_N_,`	#,#4$,090BICG
 @':-Hi-X*+)3'2\'A	(" $*R<$, $4:;L;L#L$, !+ < <S@Q Q < * < <S@Q Q <#-#?#?!#C-/WWV\\#5Fs5K-Ls-RF':-HSY-Z*+)3(" #'"B"B+6%),06&;	 #C #  % |">>'?T ?  s[z (:=N=]=]']$ 8--{;)/83E3E3O3O)O&+?+ZHZDZ(%9v66{RX`u6vv   7SV$*3q6(#667
 sD   AN	IL/ B
L/ N	/	M18/M,'M4 ,M11M4 4NN	r[   r  r  rl   r  usager  r   r  r  )r  r  r%  r   r  rL  r  r  )'r   r?  r>  r@  r	  r  _MODELS_WITH_TOOL_SUPPORTrQ  r  r;  r   r   r  r   r   r   rI   is_continuationr=  get_seq_lengthshaper(   mapr  r  getattrr  r  r  r  rL  r+   r   r  r.   r,   rc  
model_dumpr'   )r   r   r  r  r   r  r  supported_model_familiesr  r%  generation_streamerr=  seq_lenr  r  r  	generatorrL  r  choicer  r  rI   rJ  r   rP  r
  rK  s   `                     @@@@@@rS   r  zServe.generate_chat_completion  sH    '++CL9<Z B<;. $ 7 7G E2dooE/889NOy**5I*FJJ8U]^ !(A 	$'5<<+E+Ea+H+N+N+PP$<!	 .."&'''" / 
 5<<(WW\73
 #u||11!4::<<"'2 3

 >c[`[r[rs$-?((779Gk"((,w6 $ 2 2

+!2'+,
J	 J	X 778$D--/EFY[e/fg. 
 G"M./BJOIE" (q)6<<D9NN6<<#7#78''$*$8$8M5'40!KKE( &4DIIK((+ 5bggg>NU` a&3	 &"" ,66D6IF3EFFrR   c                 0     j                  d          j                  k7  } _         j                        \  }t        d   t              r'dv r	dd   dgng }|j                  dd   d       nt        d   t              r8dv r.d   d   d   dk7  rdd   dgd   }nYd   }d   |d   d	<   nHd   }nBt        d   t              r$dv r	dd   dgng }|j                  d          nt        d
      |j                  |ddd      d   }|j                  j                        }j                  dd      d}dj                  j                  d   j                         v rd}t!        ||d      }t#        j$                        }d} j'                        r=|s; j(                  j+                         }	|d   j,                  d   |	kD  r j(                  }|t/        |      ||d|d fd}
 |
|      S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  r_   r   r  r  %inputs should be a list, dict, or strTr  r  r  r  r  rj   r"  r#  Fr$  r'  Nr   )r  attention_maskr(  rI   r)  r*  c              3   ~	  K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}d}d}	 |j	                          t        j
                         }	t        d|t        d	 |	d
j                  d      dddiidg g j                  dd      dj                  d                  }
|dz  }j                  |
       t        d|t        d	 |	dj                  d      dddiidg g j                  dd      dj                  d                  }|dz  }j                  |       t        d||t        d dddg             }|dz  }j                  |       t        dd |||t        dd g !      "      }|dz  }j                  |       d }| D ]  }dj                   j                  d   j                         v r|j                  d#      }||z  }|r7||v rd}d }Mt!        d$d ||||g %      }|dz  }j                  |       {|s~t!        d$d ||||g %      }|dz  }j                  |        t#        d&d ||d|g '      }|dz  }j                  |       t%        d(d |||t        d|j&                  g !      "      }|dz  }|dz  }j                  |       t)        d)||t        d dd*d|j*                  gg +            }|dz  }|dz  }j                  |       t-        d,|t        d	 |	d*j                  d      dddii|j.                  gdg j                  dd      dj                  d      -            }|dz  }j                  |       |j1                          |j1                          y # t2        $ r}t4        j7                  d.t9        |              t;        d/|t9        |      0      }|dz  }j                  |       t=        d1|t        d	 	d2j                  d      dddiig dg ddj                  d      t?        d3t9        |      4      5            }|dz  }j                  |       Y d }~d }~ww xY w# |j1                          w xY ww)6NFr#  r   Tr,  c                  L     j                   di | }|j                  _        y r.  r/  r1  s     rS   r3  zMServe.generate_response.<locals>.stream_response.<locals>.generate_with_cache8  r4  rR   r5  zresponse.createdresp_queuedr[  formatr  r^   r4  ry   autoro   )r  
created_atr*  r   r[  r^   r%  r   r  ry   rf   ro   )r  sequence_numberr4  r  zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )r  r  r*  r  r  )r  rg  output_indexitemzresponse.content_part.addedoutput_textr   r  r^   annotations)r  item_idrg  rj  content_indexpartr6  zresponse.output_text.delta)r  ro  rg  rj  rp  r  rn   zresponse.output_text.done)r  ro  rg  rj  rp  r^   rn   zresponse.content_part.donezresponse.output_item.done	completedr  r  r*  r  r  rn  zresponse.completedr  rf  r*  r   r[  r^   r  r%  r   ry   rf   ro   z"Exception in response generation: rn  )r  rg  r  zresponse.failedfailedserver_error)coder  )r  rf  r*  r   r[  r^   r  r%  r   ry   rf   ro   rn  ) rQ  r  r;  r   r   r  r8   r4   r   r  r<   r=   r?   r6   r@   r=  rA   rB   r7   r^   r>   rq  r5   rk  rc  r  r   rn  rO   r:   r;   r9   )r(  r  rC  rD  r3  rE  rg  rj  rp  rf  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedrF  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedrt  error_eventresponse_failedrJ  r   rP  r   r
  r   s                           rS   stream_responsez0Serve.generate_response.<locals>.stream_response.  s     J M5<<55a8>>@@!
 =E #6?PQFOLMY!YY[
 $8+$3%":,/#-'3%(WW^%<&(89) !,/GG4I5,Q$*!$!4$ $  1$//0@AA'>/$3%":,/#-,3%(WW^%<&(89) !,/GG4I5,Q$*!$!4($$  1$//0DEE .J5$3!-.!*.Y}[fpr	.*  1$//0JKK /L6":,/$3!-"/+RUWX/+  1$//0KLL & %XF5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G "(G3).J&(G$9O%A*.zl(;0?-9.;&,)+:6 ,q0O"&";";<V"WW "9O%A*.zl(;0?-9.;&,)+:6 ,q0O"&";";<V"WWK%XP -B4":,/$3!-"# -)  1$//0IJJ .J5":,/$3!-"/+E^EcEcqst.*  1$"//0JKK -H4$3!-.!*.&*(!;!@!@ A$&	-)  1$!//0IJJ &<-$3%":,/#-*3%(WW^%<&(89 9 > >?) ,/GG4I5,Q$*!$!4&"$  1$//0BCCJ I  !AA#a&JK0 $3F
  1$//<<"5*$3%":,/#-'3%(WW^%<&(89!) ,1$*!$!4+!/$'F#,  1$//@@C!AH sE   AR=G5O E7O <R=	R%CR R(  R%%R( (R::R=)r?  r>  r@  r  rO   r  r$  rE  	TypeErrorr   r  r   r   rQ  r  r;  r   r   rI   rN  r=  rO  rP  r   )r   r   r  r   r  r%  rU  rI   r=  rV  r  rJ  r   rP  r
  s   ``         @@@@rS   r  zServe.generate_response  sj    !% 7 7G E2dooE/889NOyc'lC(M[_bMbxC4GHIhjFMM6c'lCDGd+$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd+M[_bMbxC4GHIhjFMM#g,'CDD..$tQU / 

 5<<(WW3W=
 #u||11!4::<<"'2 3

 >c[`[r[rs$-?((779Gk"((,w6 $ 2 2 -f5+!2'+,
l	 l	\ 2J??rR   c                    | j                  |d         }|| j                  k7  }|| _        | j                  |      \  }}t        |d   t              r'd|v r	d|d   dgng }|j                  d|d   d       nt        |d   t              r8d|v r.|d   d   d   dk7  rd|d   dg|d   }nY|d   }|d   |d   d	<   nH|d   }nBt        |d   t              r$d|v r	d|d   dgng }|j                  |d          nt        d
      |j                  |ddd      d   }|j                  |j                        }|j                  dd      }d}d|j                  j                  d   j                         v rd}t!        ||j"                        }	d}
| j%                  |      r:|s8| j&                  j)                         }|j*                  d   |kD  r| j&                  }
|j-                  |t/        |      |	d|
      }|j0                  | _        |j3                  |j4                  |      d   }t7        j6                         }t9        d| dddt;        d|g       gg       }t=        d| |d||j                  d      d d!d"ii|gd#g |j                  d$d      d%|j                  d&      '      }|j?                  d(      S ))a  
        Generates an OpenAI Response in non-streaming mode (single JSON payload).

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `dict`: The OpenAI `Response` serialized as a dict.
        r   rZ  r[  r\  r  r_   r   r  r  r]  Tr  r^  r  rj   r"  r#  Fr'  Nr   )r  r_  rI   r)  r*  r%  ri  r  rr  r  rl  rm  rs  rb  rd  r  r^   r4  ry   re  ro   rt  r  ) r?  r>  r@  r  rO   r  r$  rE  r`  r   r  r   r   rQ  r  r;  r   rI   rN  r=  rO  rP  r0  r   r*  batch_decode	sequencesr  r?   r@   r4   rS  )r   r   rP  r  r   r   r  r
  r%  rI   r=  rV  r2  	full_textrf  response_output_itemr  s                    rS   r  z%Serve.generate_response_non_streaming  s    !% 7 7G E2dooE/889NOyc'lC(M[_bMbxC4GHIhjFMM6c'lCDGd+$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd+M[_bMbxC4GHIhjFMM#g,'DEE..$tQU / 

 5<<(WW3W=
 #u||11!4::<<"'=c[`[r[rs$-?((779G||B') $ 2 2..*62/$() ) 
 -<< **?+D+DZm*nopq	YY[
4j\"']XZ[\ 
 &zl#!'0VV,-() #(=u EWWZ(
 ",,$,??rR   c                 (  
 t               st        d      | j                  |d         }| j                  |      \  t	        j
                  dd      }t        |j                        }j                  j                  }t        j                  |d         }t        j                  ||d      \  }} ||d	      j                  j                        

d
   j                  j                         
d
<   ||dd
fd}	 |	       S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr$  r'  rZ   )srmonor  )sampling_rater  input_features)r(  rI   r)  c               3      K    j                   di } j                  | j                  d      d   }t        |      }|j	                  d        y w)NTr  r   )r^   r  rQ   )r0  r  r  r)   r  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorrJ  s      rS   _generate_transcriptionz=Serve.generate_transcription.<locals>._generate_transcription  sg     0K00U<UCTUM!0!=!=m>U>Uko!=!pqr!s)/ABM"222EFGs   AA)r   r7  r?  load_audio_model_and_processorr   r  r   rI   feature_extractorr  ior   librosaloadr  r   r   )r   r   rP  rU  rI   model_sampling_rateaudio_bytesaudio_array_r  r  r  r  rJ  s             @@@@rS   r   zServe.generate_transcriptionx  s"    $%o  !% 7 7G E'+'J'JK`'a$_2%%4T
 >)F)F

 .??MMjjV- k6IPTUQ&{BUfjknn
 *66F)G)J)J;K\K\)]%& ,!2'+
	H '((rR   c                 >   |j                  d      xs |j                  d      }d}| j                  d}n`t        | j                        t        |      k\  rd}n<t        t        | j                              D ]  }| j                  |   ||   k7  sd} n || _        |S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  rZ  TF)r   r<  lenrange)r   r   r  req_continues_last_messagesis        rS   rN  zServe.is_continuation  s     77:&:#'''*:&*# %*/'##$H5*/' 3t1123 %%a(HQK727/
 &**rR   c                     | j                   dk(  rt        ddd      }n| j                   dk(  rt        d      }nd}|t        j                  d|        |S )	z
        Returns the quantization config for the given CLI arguments.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        zbnb-4bitTnf4)load_in_4bitbnb_4bit_quant_typebnb_4bit_use_double_quantzbnb-8bit)load_in_8bitNz0Quantization applied with the following config: )r   r   r   r   )r   quantization_configs     rS   get_quantization_configzServe.get_quantization_config  si     
*"4!$)*.#
 *,"4$"G"&*KKJK^J_`a""rR   model_idc                 H    | j                   | j                   }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        r  z@main)r   )r   r  s     rS   r?  zServe.process_model_name  s4     '''H(?O5!!rR   rP  c                    ddl }ddlm}m} t        j                  d|        d|v r|j                  dd      \  }}n|d}}	 |j                  ||| j                        }| j                  d
v r| j                  nt        || j                        }| j                         }	|| j                  || j                  | j                  |	d}
 |j                  |fi |
}t        t        |j                   d         } |j                  |fi |
}|j"                  j$                  du xr |j"                  j&                  dk(  }|j"                  j$                  duxr |j"                  j$                  dk  }|s|rd|j"                  _        t        j                  d|        ||fS # t        $ r@ 	 t        j                  ||| j                        }n# t        $ r t        d	      w xY wY w xY w)a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        r   N)
AutoConfigAutoProcessorzLoading r  r  r  )revisionr   zBFailed to load processor with `AutoProcessor` and `AutoTokenizer`.)re  N)r  r   r   
device_mapr   r     r  zLoaded model )r   r   r  r  r   r   r  from_pretrainedr   OSErrorr   r   rR  r  r   r   r  rI   r   
max_length)r   rP  r   r  r  r  r  data_processorr   r  model_kwargsrQ  architecturer   has_default_max_lengthhas_short_max_new_tokenss                   rS   _load_model_and_data_processorz$Serve._load_model_and_data_processor  s    	:h4567''!6!<!<S!!DHh!6hH	d*::!"&"8"8 ; N #jjN:

tzz@Z"::< !#'#;#;++!%!7!7#6
 ,++HEE|V-A-A!-DE,,,XFF ##22d:gu?V?V?a?aeg?g 	 ##22$>p5CZCZCiCilpCp 	! "%=59E##2m$9#:;<n$$I  	dd!.!>!>%&*&<&<"
  dbccd 	ds*   F# #	G,-"GG,G%%G,+G,)r   r    c                 x   || j                   vs| j                   |   j                         r=| j                  |      \  }}t        || j                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   r   r   r  r   r   r   r   r   )r   rP  r   r   s       rS   r@  zServe.load_model_and_processor4  s     !(:(::d>P>PQf>g>r>r>t#BBCXYE98B $ 2 2#9D45 i	 45AAC&&'<=CCE**+@AKKIirR   )r   r!   c                 x   || j                   vs| j                   |   j                         r=| j                  |      \  }}t        || j                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   rP  r  r  s       rS   r  z$Serve.load_audio_model_and_processorO  s     !(:(::d>P>PQf>g>r>r>t+/+N+NOd+e(K8B $ 2 2)9D45 O++	 45AAC,,-BCIIK"001FGQQOO++rR   )Fre  re  FNN	localhosti@  r   r   NFFNF)r   NNNNNNNr   )3rK   rL   rM   r
   r]   typerOptionrO   r   r   rL  rd  rE  r   rE   rk  rv  r  r  r  r$  r1   r   r   r/   r  staticmethodrD   r  r   r  r'  r(   r'   r  r   r	  r  r  r   r  r  r   rN  r   r  r?  r  tupler@  r  rQ   rR   rS   r   r   h  s|         ZeUY    ns  qF&,%,,$^__
F
 ELL~
F $JELL x
F" %,%,,$XYY
#F( '$JELL Z
)F4  $JELLabd
5F< \U\\/UVVW=F> \U\\/PQQR?F@ !#eff
AFF #]^^
GFL  $J*]^^
MFR ELL L
SF^ $D,%,,<i*j$jk_F` $JELL X
aFl  ,%,,d1bcc
mFr 
sFP	)// / 	/
 /b
 

 

d 
 " $(7;-19=88 t8 Tz	8
 Dj8 Tz8 ,-48 $d*8 568 
8t G$7)$C G G G  -!#* -!T#s(^8L -!  -!^TKt TK TKQbeqQq TKl "3   ( + x +  + ZCGD CG5F5U CGJq@T q@iT4.H q@f	X@4 X@D X@t.)$ .)9S$_3M .)`+4 +D +<#);d)B #."3 "3 ""D%C D%L %( 	;	< 6,C ,ERuLv ,rR   r   a  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.

Models will be loaded and unloaded automatically based on usage and a timeout.


The server will expose the following endpoints:
    - POST /v1/chat/completions: Generates chat completions.
    - POST /v1/responses: Generates responses.
    - POST /v1/audio/transcriptions: Generates transcriptions from audio.
    - GET /v1/models: Lists available models for 3rd party tools.

Requires FastAPI and Uvicorn to be installed.
__main__)rT  r  r   enumr   r  r   r  r  r   r  r1  collections.abcr   r   
contextlibr   	functoolsr   r   r   typingr	   r
   r   r   r   r  huggingface_hubr   tokenizers.decodersr   r   r   r   r   r   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   utilsr   r   r    r!   r  r"   r  PILr#   r6  rH  rC  r$   r%   fastapi.middleware.corsr&   fastapi.responsesr'   r(    openai.types.audio.transcriptionr)   .openai.types.audio.transcription_create_paramsr*   openai.types.chatr+   r,   r-   !openai.types.chat.chat_completionr.   'openai.types.chat.chat_completion_chunkr/   r0   r1   r2   r  *openai.types.chat.completion_create_paramsr3   openai.types.responsesr4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   -openai.types.responses.response_create_paramsrC   pydanticrD   rE   rF   rH   rW   rY   ry  r}  r  rz  r~  r  r8  rK   r   r?  r$  rl  rM  r0  r   r   r   Enumr   rE  r   r   r   r   rN   rY  rQ   rR   rS   <module>r     sL       	 	  	     / *    G G  * ,   e e     K  k 4 6k;O;QkViVk   .6A>\cc8  [    " \@@4QY^ 6U]b 0MUZ  %%NO&'RS)*OP %!.# 
		H	%
   !!2!7!7!9: !*tyy 8	8-8 	8v 1@ 1@h~, ~,D0 zGE rR   