
    Li!                       d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	m	Z	m
Z
mZ ddlmZmZmZ ddlmZ ddlmZmZ ddlmZmZ  ej0                  e      Zdd	Zddd
ZddZedk(  r ej<                  ej>                  ej@                  d        e       Z!e!jE                         Z# ejH                   ee#jJ                              Z& e' ejP                  e&de)             yy)u   
Cron Scheduler — KB Ingestion Pipeline Module 11
=================================================
Runs nightly re-ingestion for all active platforms.

Usage:
    python3 -m core.kb.cron
    python3 -m core.kb.cron --dry-run
    )annotationsN)datetimetimezone	timedelta)AnyDictOptional)ingest_platform)list_platformsget_platform)get_ingestion_historyget_connectionc                p   t        || d      }|sy|d   j                  d      }|yt        j                  t        j
                        }t        |t              rt        j                  |      }|j                   |j                  t        j
                        }||z
  j                         dz  }||k\  S )uS  
    Check if a platform needs re-ingestion based on last ingestion time.

    Parameters
    ----------
    platform_name:
        The platform key (e.g. "telnyx", "hubspot").
    refresh_hours:
        Minimum hours that must have elapsed since the last ingestion.
        If hours_since_last_run >= refresh_hours the platform is stale.
    conn:
        Active psycopg2 connection passed to get_ingestion_history().

    Returns
    -------
    bool
        True  — platform should be re-ingested (never run, or stale).
        False — platform was ingested recently enough, skip it.
       )limitTr   
started_at)tzinfoi  )r   getr   nowr   utc
isinstancestrfromisoformatr   replacetotal_seconds)platform_namerefresh_hoursconnhistorylast_runr   hours_sinces          %/mnt/e/genesis-system/core/kb/cron.py_should_refreshr#   $   s    ( $D-qAGqz~~l+H
,,x||
$C (C ))(3 ##8<<#8>002T9K-''    c           
       K   t        j                          }t        j                  t        j                        j                         }|dddi ddd}t               }t        |      |d<   d}	 t               }|D ]&  }t        |      }|,t        j                  d|       |dxx   dz  cc<   d	|d
   |<   =t        ||j                  |      }|s,t        j                  d|       |dxx   dz  cc<   d	|d
   |<   | r,t        j                  d|       |dxx   dz  cc<   d|d
   |<   t        j                  d|       	 t        |       d{   }	|dxx   dz  cc<   |	|d
   |<   t        j                  d||	j!                  dd      |	j!                  dd             ) 	 |	 |j)                          t+        t        j                          |z
  d      |d<   |S 7 # t"        $ rE}
|dxx   dz  cc<   t%        |
      |d}||d
   |<   t        j'                  d||
       Y d}
~
d}
~
ww xY w# t"        $ r Y w xY w# |!	 |j)                          w # t"        $ r Y w w xY ww xY ww)u  
    Run nightly re-ingestion for all registered platforms.

    For each platform:
    1. Check last ingestion time from PG via get_ingestion_history().
    2. If hours since last ingestion >= platform.refresh_hours → ingest.
    3. Otherwise skip (platform is fresh).
    4. Collect stats from each ingestion; aggregate into a summary result.

    Parameters
    ----------
    dry_run:
        When True, check which platforms need refresh but do NOT ingest.
        "platforms_ingested" will count platforms that *would* be ingested,
        and all platform results for those will be "would_ingest".

    Returns
    -------
    dict with keys:
        run_at               — ISO 8601 timestamp of when this run started
        platforms_checked    — total platforms evaluated
        platforms_ingested   — platforms that were (or would be) ingested
        platforms_skipped    — platforms that were up-to-date
        platform_results     — {platform_name: stats_dict | "skipped" | "would_ingest"}
        errors               — count of platforms that raised an exception
        total_duration_seconds — wall-clock duration of the whole run
    r   g        )run_atplatforms_checkedplatforms_ingestedplatforms_skippedplatform_resultserrorstotal_duration_secondsr'   Nu6   Platform '%s' listed but config not found — skippingr)   r   skippedr*   u#   Platform '%s' is fresh — skippingu&   DRY RUN — would ingest platform '%s'r(   would_ingestzIngesting platform '%s'z+Platform '%s' ingested: %d pages, %d chunkspages_fetchedchunks_createdr+   )errorplatformz"Platform '%s' ingestion failed: %s   r,   )timer   r   r   r   	isoformatr   lenr   r   loggerwarningr#   r   infor
   r   	Exceptionr   r1   closeround)dry_runt_startr&   resultplatform_namesr   nameconfigshouldstatsexc
error_infos               r"   nightly_ingestionrG   S   s    8 iikG\\(,,'113F "%F $%N"%n"5F D2" '	ND!$'F~WY]^*+q0+3<)*40$T6+?+?FFA4H*+q0+3<)*40 DdK+,1,3A)*40 KK148N-d33+,1,38)*40AIIoq1II.2	;'	NV 

 (-TYY[7-BA'FF#$M1 4  Nx A% '*3xTB
3=)*40A4MM	N   

  s   A*I'-CH?  GGAG H? $I''H0 7&I'G	H-(:H("H? (H--H? 0	H<9I';H<<I'?I$II$	I I$I  I$$I'c                 \    t        j                  dd      } | j                  ddd       | S )Nzpython3 -m core.kb.cronu7   KB Nightly Cron Scheduler — re-ingest stale platforms)progdescriptionz	--dry-run
store_truez=Check which platforms need refresh without actually ingesting)actionhelp)argparseArgumentParseradd_argument)parsers    r"   _build_parserrR      s=    $$&MF L  
 Mr$   __main__u2   %(asctime)s %(levelname)s %(name)s — %(message)s)levelstreamformat)r=      )indentdefault)r   r   r   intreturnbool)F)r=   r\   r[   zDict[str, Any])r[   zargparse.ArgumentParser)*__doc__
__future__r   rN   asynciojsonloggingsysr4   r   r   r   typingr   r   r	   core.kb.orchestratorr
   core.kb.platform_registryr   r   core.kb.pg_storer   r   	getLogger__name__r7   r#   rG   rR   basicConfigINFOstderrrQ   
parse_argsargsrunr=   
run_resultprintdumpsr    r$   r"   <module>rs      s    #     
  2 2 & & 1 B B			8	$'(^dX
 zGllzzC
 _FD.t||DEJ	*$**Z3
78 r$   