#!/usr/bin/env python3
"""
Genesis Validation Gate 2 - Output Quality Validation
======================================================
Validates output quality, coherence, and task completion.
Intelligence layer enforcing P5 (Consensus), P6 (Confidence), P3 (Risk).

PM-043: Validation Gate 2 - Output Quality Enhancement
- Validates output quality and coherence
- Rejects low-quality outputs
- Provides improvement suggestions
"""

import logging
import re
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

# Module-level logger for this gate.
logger = logging.getLogger("ValidationGate2")
# NOTE(review): basicConfig() at import time configures the root logger as a
# side effect of importing this module — confirm this is intended for library use.
logging.basicConfig(level=logging.INFO)


@dataclass
class QualityResult:
    """Result of quality validation.

    Produced by OutputQualityGate.validate(); serializable via to_dict().
    """
    valid: bool              # overall verdict (score and risk both acceptable)
    score: float             # fraction of checks that passed, in [0, 1]
    checks: Dict[str, bool]  # per-check pass/fail map (check name -> result)
    confidence: float        # P6 dynamic confidence score, in [0, 1]
    risk_level: float        # P3 assessed risk, in [0, 1]
    suggestions: List[str]   # human-readable improvement hints for failed checks
    timestamp: str           # ISO-8601 creation time of this result

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (suitable for JSON logging)."""
        return asdict(self)


class OutputQualityGate:
    """
    Gate 2: Output Quality Layer
    Validates output quality, coherence, and relevance.
    Enforces P5 (Consensus), P6 (Confidence), P3 (Risk).
    """

    # Quality thresholds shared by the individual checks.
    THRESHOLDS = {
        "min_confidence": 0.7,   # P6: minimum acceptable confidence
        "max_risk": 0.4,         # P3: maximum acceptable risk
        "min_length": 10,        # characters; shorter outputs fail length check
        "max_length": 100000,    # characters; longer outputs fail length check
        "min_coherence": 0.6     # minimum task/output keyword overlap
    }

    # High-stakes keywords for risk assessment (P3); each hit adds risk.
    HIGH_STAKES_KEYWORDS = [
        "financial", "medical", "legal", "safety", "security",
        "delete", "remove", "destroy", "payment", "password",
        "credential", "emergency", "critical", "urgent"
    ]

    # Quality indicators: positives raise confidence, negatives lower it.
    QUALITY_INDICATORS = {
        "positive": [
            "complete", "successful", "verified", "confirmed",
            "implemented", "tested", "validated"
        ],
        "negative": [
            "error", "failed", "unable", "cannot", "impossible",
            "unsure", "maybe", "possibly", "might"
        ]
    }

    def __init__(self, memory_stores=None, consensus_threshold: float = 0.7):
        """
        Initialize output quality gate.

        Args:
            memory_stores: Optional memory backend reference
            consensus_threshold: Threshold for consensus validation
        """
        self.memory = memory_stores
        self.consensus_threshold = consensus_threshold
        # Every QualityResult ever produced, in order (used by metrics/history).
        self.validation_log: List[QualityResult] = []
        # Per-worker list of past scores; feeds the trust adjustment in
        # _calculate_confidence().
        self.worker_history: Dict[str, List[float]] = {}

        logger.info("Validation Gate 2 (Output Quality) initialized")

    def validate(self, output: str, task_description: str,
                 worker_id: Optional[str] = None,
                 metadata: Optional[Dict[str, Any]] = None) -> Tuple[float, Dict[str, bool]]:
        """
        Validate output quality and coherence.

        Runs all quality checks, records a QualityResult in the validation
        log, and updates the worker's score history.

        Args:
            output: The output to validate
            task_description: Original task description
            worker_id: ID of the worker that produced the output
            metadata: Additional validation context

        Returns:
            Tuple of (score, checks_dict)
        """
        metadata = metadata or {}
        suggestions: List[str] = []
        checks: Dict[str, bool] = {}

        # P5: Consensus check (simplified for single-agent)
        checks["consensus_reached"] = self._check_consensus(output, metadata)

        # P6: Dynamic confidence scoring
        confidence = self._calculate_confidence(output, metadata, worker_id)
        checks["confidence_high"] = confidence >= self.THRESHOLDS["min_confidence"]

        if not checks["confidence_high"]:
            suggestions.append(f"Confidence {confidence:.2f} below threshold. Add more verification.")

        # P3: Risk assessment
        risk_level = self._assess_risk(output, task_description)
        checks["risk_acceptable"] = risk_level <= self.THRESHOLDS["max_risk"]

        if not checks["risk_acceptable"]:
            suggestions.append(f"Risk level {risk_level:.2f} is high. Review for safety concerns.")

        # Quality: Coherence check
        coherence = self._assess_coherence(output, task_description)
        checks["output_coherent"] = coherence >= self.THRESHOLDS["min_coherence"]

        if not checks["output_coherent"]:
            suggestions.append("Output may not fully address the task. Review for relevance.")

        # Quality: Completeness check
        checks["output_complete"] = self._check_completeness(output, task_description)

        if not checks["output_complete"]:
            suggestions.append("Output appears incomplete. Ensure all aspects are addressed.")

        # Quality: Length validation (may append its own suggestions)
        checks["length_valid"] = self._validate_length(output, suggestions)

        # Quality: No hallucination indicators
        checks["no_hallucinations"] = self._check_hallucinations(output)

        if not checks["no_hallucinations"]:
            suggestions.append("Output contains uncertainty indicators. Verify factual accuracy.")

        # Quality: Professional tone
        checks["professional_tone"] = self._check_tone(output)

        # Quality: Actionable content
        checks["actionable_content"] = self._check_actionable(output, task_description)

        if not checks["actionable_content"]:
            suggestions.append("Consider adding more actionable guidance.")

        # Score is the fraction of checks that passed.
        score = sum(checks.values()) / len(checks)

        # Create result. Risk is a hard gate: even a high score is invalid
        # when risk exceeds the threshold.
        result = QualityResult(
            valid=score >= 0.75 and checks["risk_acceptable"],
            score=score,
            checks=checks,
            confidence=confidence,
            risk_level=risk_level,
            suggestions=suggestions,
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # (Python 3.12+) and returned a naive datetime.
            timestamp=datetime.now(timezone.utc).isoformat()
        )

        # Update worker history (trust signal for future confidence scoring).
        if worker_id:
            self.worker_history.setdefault(worker_id, []).append(score)

        # Log result
        self._log_validation(result, worker_id)

        return score, checks

    def _check_consensus(self, output: str, metadata: Dict[str, Any]) -> bool:
        """
        Check for consensus among multiple agents.
        In single-agent mode, checks for internal consistency.
        """
        # If multiple outputs provided for consensus
        if "other_outputs" in metadata:
            other_outputs = metadata["other_outputs"]
            if not other_outputs:
                return True

            # Calculate similarity with other outputs
            similarities = [
                self._text_similarity(output, other)
                for other in other_outputs
            ]
            avg_similarity = sum(similarities) / len(similarities)
            return avg_similarity >= self.consensus_threshold

        return True  # Single output, consensus by default

    def _calculate_confidence(self, output: str, metadata: Dict[str, Any],
                              worker_id: Optional[str] = None) -> float:
        """
        Calculate dynamic confidence score in [0, 1].
        Based on output quality, worker history, and metadata.
        """
        base_confidence = 0.5

        # Worker trust adjustment: once a worker has >= 3 scores, shift
        # confidence by how its recent average compares to the 0.5 baseline.
        if worker_id and worker_id in self.worker_history:
            history = self.worker_history[worker_id]
            if len(history) >= 3:
                recent_avg = sum(history[-5:]) / min(len(history), 5)
                base_confidence += (recent_avg - 0.5) * 0.3

        # Output length bonus (reasonable length indicates effort)
        length = len(output)
        if 100 <= length <= 5000:
            base_confidence += 0.15
        elif length > 5000:
            base_confidence += 0.1

        # Positive indicators bonus (capped at +0.15)
        positive_count = sum(
            1 for word in self.QUALITY_INDICATORS["positive"]
            if word.lower() in output.lower()
        )
        base_confidence += min(positive_count * 0.05, 0.15)

        # Negative indicators penalty (capped at -0.2)
        negative_count = sum(
            1 for word in self.QUALITY_INDICATORS["negative"]
            if word.lower() in output.lower()
        )
        base_confidence -= min(negative_count * 0.05, 0.2)

        # Structure bonus (has sections/formatting)
        if "\n\n" in output or "- " in output or "1." in output:
            base_confidence += 0.1

        # Clamp to [0, 1].
        return max(0.0, min(1.0, base_confidence))

    def _assess_risk(self, output: str, task_description: str) -> float:
        """
        Assess risk level of the output, in [0, 1].
        Higher risk for high-stakes domains.
        """
        risk = 0.1  # Base risk

        combined_text = f"{output} {task_description}".lower()

        # Check for high-stakes keywords (+0.1 each)
        for keyword in self.HIGH_STAKES_KEYWORDS:
            if keyword in combined_text:
                risk += 0.1

        # Check for code execution patterns
        if re.search(r'exec\(|eval\(|system\(|subprocess', output):
            risk += 0.3

        # Check for external requests
        if re.search(r'http[s]?://|curl |wget |api\.', output, re.IGNORECASE):
            risk += 0.1

        # Check for file operations
        if re.search(r'delete|remove|rm |unlink|truncate', output, re.IGNORECASE):
            risk += 0.2

        return min(risk, 1.0)

    def _assess_coherence(self, output: str, task_description: str) -> float:
        """
        Assess coherence between output and task, in [0, 1].
        Basic keyword overlap and structure check.
        """
        if not output or not task_description:
            return 0.0

        # Extract keywords (words of 4+ letters) from the task description.
        task_words = set(re.findall(r'\b[a-zA-Z]{4,}\b', task_description.lower()))

        # Check how many task keywords appear in output
        output_lower = output.lower()
        matches = sum(1 for word in task_words if word in output_lower)

        if not task_words:
            return 0.7  # Default for simple tasks

        return min(matches / len(task_words), 1.0)

    def _check_completeness(self, output: str, task_description: str) -> bool:
        """Check if output appears to be complete."""
        # Very short outputs are likely incomplete
        if len(output) < 50:
            return False

        # Check for truncation indicators
        truncation_indicators = [
            "...",
            "[truncated]",
            "[continued]",
            "to be continued"
        ]
        for indicator in truncation_indicators:
            if indicator.lower() in output.lower():
                return False

        # Check if output ends mid-sentence
        stripped = output.strip()
        if stripped and stripped[-1] not in ".!?\"')]:;":
            # Allow for code blocks and lists
            if not stripped.endswith("}") and not stripped.endswith("`"):
                return False

        return True

    def _validate_length(self, output: str, suggestions: List[str]) -> bool:
        """Validate output length is within acceptable range.

        Side effect: appends a hint to *suggestions* when the check fails.
        """
        length = len(output)

        if length < self.THRESHOLDS["min_length"]:
            suggestions.append("Output is very short. Consider adding more detail.")
            return False

        if length > self.THRESHOLDS["max_length"]:
            suggestions.append("Output is very long. Consider condensing.")
            return False

        return True

    def _check_hallucinations(self, output: str) -> bool:
        """Check for hallucination indicators; True means none found."""
        hallucination_patterns = [
            r'I apologize, but I cannot',
            r'As an AI',
            r'I don\'t have access to',
            r'\[citation needed\]',
            r'I\'m not sure (if|whether)',
            r'I cannot verify',
            r'hypothetically speaking'
        ]

        for pattern in hallucination_patterns:
            if re.search(pattern, output, re.IGNORECASE):
                return False

        return True

    def _check_tone(self, output: str) -> bool:
        """Check for professional tone; True means no unprofessional markers."""
        unprofessional_patterns = [
            r'\b(lol|lmao|omg|wtf)\b',
            r'!!!+',
            r'\?\?\?+',
            r'\b(stupid|dumb|idiot)\b'
        ]

        for pattern in unprofessional_patterns:
            if re.search(pattern, output, re.IGNORECASE):
                return False

        return True

    def _check_actionable(self, output: str, task_description: str) -> bool:
        """Check if output contains actionable content."""
        # For informational queries, any substantive response is actionable
        info_keywords = ["what is", "explain", "describe", "how does"]
        for keyword in info_keywords:
            if keyword in task_description.lower():
                return len(output) > 100

        # For action tasks, look for action words or instructions
        action_patterns = [
            r'\b(do|run|execute|create|build|implement)\b',
            r'\d+\.',  # Numbered steps
            r'- ',     # Bullet points
            r'```',    # Code blocks
            r'should|need to|must|will'
        ]

        for pattern in action_patterns:
            if re.search(pattern, output, re.IGNORECASE):
                return True

        return False

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate simple text similarity (Jaccard index over word sets)."""
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        if not words1 or not words2:
            return 0.0

        intersection = words1 & words2
        union = words1 | words2

        return len(intersection) / len(union)

    def _log_validation(self, result: QualityResult,
                        worker_id: Optional[str] = None) -> None:
        """Record the result in the log and emit a summary log line."""
        self.validation_log.append(result)

        log_level = logging.WARNING if not result.valid else logging.INFO
        # Lazy %-style args: formatting only happens if the level is enabled.
        logger.log(
            log_level,
            "Gate2 Validation [worker=%s]: valid=%s, score=%.2f, "
            "confidence=%.2f, risk=%.2f",
            worker_id, result.valid, result.score,
            result.confidence, result.risk_level
        )

    def get_improvement_suggestions(self, output: str,
                                    task_description: str) -> List[str]:
        """Get improvement suggestions without inspecting the full result.

        Note: runs a full validate() pass, so it updates the validation log.
        """
        self.validate(output, task_description)

        # Return last validation's suggestions
        if self.validation_log:
            return self.validation_log[-1].suggestions
        return []

    def get_validation_history(self, limit: int = 100) -> List[Dict[str, Any]]:
        """Get up to *limit* most recent validation results as dicts."""
        return [r.to_dict() for r in self.validation_log[-limit:]]

    def get_metrics(self) -> Dict[str, Any]:
        """Get aggregate validation gate metrics."""
        if not self.validation_log:
            return {"total_validations": 0}

        valid_count = sum(1 for r in self.validation_log if r.valid)
        total_count = len(self.validation_log)  # >= 1 past the early return

        return {
            "total_validations": total_count,
            "valid_count": valid_count,
            "invalid_count": total_count - valid_count,
            "pass_rate": valid_count / total_count,
            "avg_score": sum(r.score for r in self.validation_log) / total_count,
            "avg_confidence": sum(r.confidence for r in self.validation_log) / total_count,
            "avg_risk": sum(r.risk_level for r in self.validation_log) / total_count,
            "workers_tracked": len(self.worker_history)
        }


# Backward compatibility alias
# Older callers import the gate under its original name; keep both bound
# to the same class so existing imports keep working.
IntelligenceGate = OutputQualityGate


if __name__ == "__main__":
    # Self-test: run the gate over a good, a poor, and a risky sample output.
    gate = OutputQualityGate()

    print("\n=== Validation Gate 2 (Output Quality) Test ===")

    good_output = """
    I've completed the analysis of the lead data. Here are the findings:

    1. Total leads processed: 150
    2. Qualified leads: 87 (58%)
    3. Conversion rate: 23%

    Recommendations:
    - Focus on the electrician segment showing highest conversion
    - Implement SMS follow-up for leads not responding to email
    - Consider A/B testing the landing page copy

    All data has been validated and synced to GHL.
    """
    poor_output = "I'm not sure about this. Maybe try something else?"
    risky_output = "I'll delete all the files and remove the database tables to clean up."

    # (label, output, task description, worker id) — validated in order.
    sample_cases = [
        ("Good output", good_output,
         "Analyze the lead data and provide recommendations", "worker_1"),
        ("Poor output", poor_output,
         "Implement the new feature", "worker_2"),
        ("Risky output", risky_output,
         "Clean up the temporary files", "worker_3"),
    ]

    for label, sample, task, worker in sample_cases:
        score, checks = gate.validate(sample, task, worker_id=worker)
        print(f"{label}: score={score:.2f}, checks={checks}")

    # Get suggestions
    print(f"\nSuggestions for poor output: {gate.get_improvement_suggestions(poor_output, 'Implement feature')}")

    # Metrics
    print(f"\nGate Metrics: {gate.get_metrics()}")