#!/usr/bin/env python3
"""
Standalone Wavecaster System
===========================
A self-contained wavecaster system that uses our enhanced tokenizer,
knowledge base, and training data to provide advanced AI capabilities
without requiring external LLM connections.
"""

import json
import asyncio
import time
from pathlib import Path
from typing import Dict, List, Any, Optional
from datetime import datetime

# Import our systems
try:
    from enhanced_tokenizer_minimal import MinimalEnhancedTokenizer
    ENHANCED_TOKENIZER_AVAILABLE = True
except ImportError:
    ENHANCED_TOKENIZER_AVAILABLE = False
    print("⚠️  Enhanced tokenizer not available")

try:
    from kgirl.distributed_knowledge_base import DistributedKnowledgeBase, KnowledgeBaseConfig
    KNOWLEDGE_BASE_AVAILABLE = True
except ImportError:
    KNOWLEDGE_BASE_AVAILABLE = False
    print("⚠️  Distributed knowledge base not available")

class StandaloneWavecasterSystem:
    """Standalone wavecaster system with integrated AI capabilities."""
    
    def __init__(self):
        self.enhanced_tokenizer = None
        self.knowledge_base = None
        self.training_data = []
        self.response_templates = {}
        
        self._initialize_components()
    
    def _initialize_components(self):
        """Initialize all wavecaster components."""
        print("🚀 Initializing Standalone Wavecaster System...")
        
        # Initialize enhanced tokenizer
        if ENHANCED_TOKENIZER_AVAILABLE:
            try:
                self.enhanced_tokenizer = MinimalEnhancedTokenizer()
                print("✅ Enhanced Tokenizer initialized")
            except Exception as e:
                print(f"❌ Enhanced Tokenizer failed: {e}")
        
        # Initialize knowledge base
        if KNOWLEDGE_BASE_AVAILABLE:
            try:
                config = KnowledgeBaseConfig(
                    db_path="second_llm_knowledge.db",
                    faiss_index_path="second_llm_faiss_index",
                    embedding_dimension=384
                )
                self.knowledge_base = DistributedKnowledgeBase(config)
                print("✅ Distributed Knowledge Base initialized")
            except Exception as e:
                print(f"❌ Knowledge Base failed: {e}")
        
        # Load training data for context
        self._load_training_data()
        
        # Initialize response templates
        self._initialize_response_templates()
    
    def _load_training_data(self):
        """Load training data for context generation."""
        print("📁 Loading training data for context...")
        
        training_files = [
            "second_llm_training_prompts.jsonl",
            "processed_training_data.jsonl",
            "comprehensive_training_data.jsonl"
        ]
        
        for file_path in training_files:
            if Path(file_path).exists():
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        for line in f:
                            line = line.strip()
                            if line:
                                data = json.loads(line)
                                self.training_data.append(data)
                except Exception as e:
                    print(f"⚠️  Error loading {file_path}: {e}")
        
        print(f"✅ Loaded {len(self.training_data)} training entries")
    
    def _initialize_response_templates(self):
        """Initialize response templates for different content types."""
        self.response_templates = {
            "academic": {
                "intro": "Based on academic research and analysis:",
                "structure": ["Introduction", "Key Concepts", "Analysis", "Implications", "Conclusion"],
                "style": "formal and analytical"
            },
            "code": {
                "intro": "From a technical and programming perspective:",
                "structure": ["Problem Analysis", "Solution Approach", "Code Example", "Explanation", "Best Practices"],
                "style": "technical and practical"
            },
            "mathematical": {
                "intro": "Mathematical analysis reveals:",
                "structure": ["Problem Statement", "Mathematical Framework", "Solution Steps", "Verification", "Applications"],
                "style": "precise and logical"
            },
            "general": {
                "intro": "Analysis shows that:",
                "structure": ["Overview", "Key Points", "Details", "Summary"],
                "style": "clear and comprehensive"
            }
        }
    
    async def initialize_knowledge_base(self):
        """Initialize the knowledge base."""
        if self.knowledge_base:
            try:
                await self.knowledge_base.initialize()
                print("✅ Knowledge base initialized")
                return True
            except Exception as e:
                print(f"❌ Knowledge base initialization failed: {e}")
                return False
        return False
    
    async def analyze_query_with_tokenizer(self, query: str) -> Dict[str, Any]:
        """Analyze query using enhanced tokenizer."""
        if not self.enhanced_tokenizer:
            return {"content_type": "general", "entities": [], "math_expressions": []}
        
        try:
            tokenizer_result = await self.enhanced_tokenizer.tokenize(query)
            return {
                "content_type": tokenizer_result.semantic_features.get("content_type", "general"),
                "entities": tokenizer_result.entities,
                "math_expressions": tokenizer_result.math_expressions,
                "fractal_features": tokenizer_result.fractal_features,
                "complexity_score": tokenizer_result.semantic_features.get("avg_word_length", 0),
                "processing_time": tokenizer_result.processing_time,
                "tokens": tokenizer_result.token_count
            }
        except Exception as e:
            print(f"⚠️  Tokenizer analysis failed: {e}")
            return {"content_type": "general", "entities": [], "math_expressions": []}
    
    async def search_relevant_knowledge(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """Search for relevant knowledge context."""
        if not self.knowledge_base:
            return []
        
        try:
            # Create query embedding
            if self.enhanced_tokenizer:
                tokenizer_result = await self.enhanced_tokenizer.tokenize(query)
                query_embedding = tokenizer_result.embeddings
            else:
                import numpy as np
                query_embedding = np.random.randn(384)
            
            # Search knowledge base
            knowledge_nodes = await self.knowledge_base.search_knowledge(
                query=query,
                query_embedding=query_embedding,
                k=k
            )
            
            # Convert to context format
            context = []
            for node in knowledge_nodes:
                context.append({
                    "content": node.content,
                    "source": node.source,
                    "coherence_score": node.coherence_score,
                    "metadata": node.metadata
                })
            
            return context
            
        except Exception as e:
            print(f"⚠️  Knowledge search failed: {e}")
            return []
    
    def find_relevant_training_data(self, query: str, content_type: str, limit: int = 3) -> List[Dict[str, Any]]:
        """Find relevant training data based on query and content type."""
        relevant_data = []
        
        for entry in self.training_data:
            # Check content type match
            entry_content_type = entry.get("category", "general")
            if content_type in entry_content_type or entry_content_type in content_type:
                # Check content similarity (simple keyword matching)
                content = entry.get("content", entry.get("completion", ""))
                if any(word.lower() in content.lower() for word in query.split()[:5]):
                    relevant_data.append(entry)
                    if len(relevant_data) >= limit:
                        break
        
        return relevant_data
    
    def generate_structured_response(self, query: str, analysis: Dict[str, Any], 
                                   knowledge_context: List[Dict[str, Any]], 
                                   training_context: List[Dict[str, Any]]) -> str:
        """Generate a structured response based on analysis and context."""
        content_type = analysis.get("content_type", "general")
        template = self.response_templates.get(content_type, self.response_templates["general"])
        
        # Build response
        response_parts = []
        
        # Introduction
        response_parts.append(f"{template['intro']}")
        response_parts.append("")
        
        # Knowledge context integration
        if knowledge_context:
            response_parts.append("**Relevant Knowledge Context:**")
            for i, ctx in enumerate(knowledge_context[:2], 1):
                summary = ctx["content"][:200] + "..." if len(ctx["content"]) > 200 else ctx["content"]
                response_parts.append(f"{i}. {summary}")
            response_parts.append("")
        
        # Training context integration
        if training_context:
            response_parts.append("**Related Training Examples:**")
            for i, ctx in enumerate(training_context[:2], 1):
                content = ctx.get("completion", ctx.get("content", ""))
                summary = content[:150] + "..." if len(content) > 150 else content
                response_parts.append(f"{i}. {summary}")
            response_parts.append("")
        
        # Structured analysis
        response_parts.append("**Analysis:**")
        for section in template["structure"]:
            response_parts.append(f"\n**{section}:**")
            
            if section == "Introduction":
                response_parts.append(f"This query addresses {content_type} concepts related to: {', '.join(query.split()[:3])}.")
                
                if analysis.get("entities"):
                    entities = analysis['entities'][:3]
                    entity_strings = [str(entity) for entity in entities]
                    response_parts.append(f"Key entities identified: {', '.join(entity_strings)}")
                
                if analysis.get("math_expressions"):
                    response_parts.append(f"Mathematical expressions detected: {len(analysis['math_expressions'])}")
            
            elif section == "Key Concepts":
                if knowledge_context:
                    concepts = [ctx["content"].split()[:5] for ctx in knowledge_context[:2]]
                    response_parts.append("Based on the knowledge base, key concepts include:")
                    for concept_set in concepts:
                        response_parts.append(f"- {' '.join(concept_set)}")
                else:
                    response_parts.append("Key concepts derived from query analysis and training data.")
            
            elif section == "Analysis":
                response_parts.append("The analysis reveals several important aspects:")
                response_parts.append(f"- Content complexity: {analysis.get('complexity_score', 0):.2f}")
                response_parts.append(f"- Processing time: {analysis.get('processing_time', 0):.3f}s")
                response_parts.append(f"- Token count: {analysis.get('tokens', 0)}")
                
                if analysis.get("fractal_features"):
                    response_parts.append(f"- Fractal analysis: {analysis['fractal_features']}")
            
            elif section == "Implications":
                response_parts.append("This analysis has several implications:")
                response_parts.append("- Enhanced understanding through multi-modal processing")
                response_parts.append("- Integration of semantic and mathematical analysis")
                response_parts.append("- Knowledge base augmentation for context")
            
            elif section == "Conclusion":
                response_parts.append("In summary, this wavecaster analysis demonstrates:")
                response_parts.append("- Advanced tokenization and semantic understanding")
                response_parts.append("- Knowledge base integration for enhanced context")
                response_parts.append("- Multi-dimensional analysis capabilities")
        
        # Add wavecaster signature
        response_parts.append("\n---")
        response_parts.append("*Generated by Standalone Wavecaster System with Enhanced Tokenizer and Knowledge Base Integration*")
        
        return "\n".join(response_parts)
    
    async def process_wavecaster_query(self, query: str) -> Dict[str, Any]:
        """Process a query through the wavecaster system."""
        print(f"🌊 Processing wavecaster query: {query[:80]}...")
        
        start_time = time.time()
        
        # Step 1: Analyze query with enhanced tokenizer
        analysis = await self.analyze_query_with_tokenizer(query)
        
        # Step 2: Search knowledge base for context
        knowledge_context = await self.search_relevant_knowledge(query)
        
        # Step 3: Find relevant training data
        training_context = self.find_relevant_training_data(query, analysis["content_type"])
        
        # Step 4: Generate structured response
        response = self.generate_structured_response(query, analysis, knowledge_context, training_context)
        
        # Step 5: Create comprehensive result
        result = {
            "query": query,
            "response": response,
            "analysis": analysis,
            "knowledge_context": knowledge_context,
            "training_context": training_context,
            "metadata": {
                "processing_time": time.time() - start_time,
                "content_type": analysis["content_type"],
                "knowledge_nodes_used": len(knowledge_context),
                "training_examples_used": len(training_context),
                "timestamp": datetime.now().isoformat()
            }
        }
        
        print(f"✅ Query processed in {time.time() - start_time:.2f}s")
        return result
    
    async def batch_process_queries(self, queries: List[str]) -> List[Dict[str, Any]]:
        """Process multiple queries in batch."""
        print(f"🌊 Processing {len(queries)} queries in batch...")
        
        results = []
        for i, query in enumerate(queries):
            print(f"  Processing query {i+1}/{len(queries)}")
            result = await self.process_wavecaster_query(query)
            results.append(result)
            
            # Small delay to prevent overwhelming
            await asyncio.sleep(0.1)
        
        print(f"✅ Batch processing completed: {len(results)} results")
        return results
    
    def create_demo_queries(self) -> List[str]:
        """Create demo queries for testing."""
        return [
            "Explain quantum computing principles and applications",
            "Analyze the mathematical foundations of neural networks",
            "Describe the relationship between fractals and AI systems",
            "What are the key components of holographic memory?",
            "How does semantic embedding enhance language processing?",
            "Explain the dimensional entanglement framework",
            "What is the role of matrix neurons in cognitive systems?",
            "Describe the integration between tokenization and knowledge bases",
            "Analyze the emergent properties of distributed AI systems",
            "What are the principles of wavecaster technology?"
        ]
    
    async def run_comprehensive_demo(self) -> Dict[str, Any]:
        """Run a comprehensive demo of the wavecaster system."""
        print("🚀 Running Standalone Wavecaster Demo")
        print("=" * 40)
        
        # Initialize knowledge base
        kb_initialized = await self.initialize_knowledge_base()
        
        # Create demo queries
        demo_queries = self.create_demo_queries()
        
        # Process queries
        print(f"📝 Processing {len(demo_queries)} demo queries...")
        results = await self.batch_process_queries(demo_queries)
        
        # Analyze results
        demo_analysis = {
            "total_queries": len(demo_queries),
            "successful_queries": len(results),
            "average_processing_time": sum(r["metadata"]["processing_time"] for r in results) / len(results),
            "knowledge_base_used": kb_initialized,
            "content_types": {},
            "total_knowledge_context": sum(r["metadata"]["knowledge_nodes_used"] for r in results),
            "total_training_context": sum(r["metadata"]["training_examples_used"] for r in results),
            "results": results
        }
        
        # Analyze content types
        for result in results:
            content_type = result["metadata"]["content_type"]
            demo_analysis["content_types"][content_type] = demo_analysis["content_types"].get(content_type, 0) + 1
        
        # Save demo results
        with open("standalone_wavecaster_demo_results.json", 'w', encoding='utf-8') as f:
            json.dump(demo_analysis, f, indent=2, ensure_ascii=False)
        
        print(f"\n📊 Standalone Wavecaster Demo Summary:")
        print(f"  ✅ Successful queries: {demo_analysis['successful_queries']}")
        print(f"  ⏱️  Average processing time: {demo_analysis['average_processing_time']:.2f}s")
        print(f"  🗄️  Knowledge base: {'✅ Used' if kb_initialized else '❌ Not used'}")
        print(f"  📚 Total knowledge context: {demo_analysis['total_knowledge_context']} nodes")
        print(f"  📖 Total training context: {demo_analysis['total_training_context']} examples")
        
        print(f"\n📋 Content Type Distribution:")
        for content_type, count in demo_analysis["content_types"].items():
            percentage = (count / len(results)) * 100
            print(f"  {content_type}: {count} queries ({percentage:.1f}%)")
        
        return demo_analysis
    
    def save_system_status(self, demo_results: Dict[str, Any]):
        """Save system status and capabilities."""
        system_status = {
            "standalone_wavecaster": {
                "enhanced_tokenizer_available": self.enhanced_tokenizer is not None,
                "knowledge_base_available": self.knowledge_base is not None,
                "training_data_loaded": len(self.training_data),
                "response_templates_initialized": len(self.response_templates)
            },
            "capabilities": {
                "semantic_analysis": self.enhanced_tokenizer is not None,
                "knowledge_enhanced_responses": self.knowledge_base is not None,
                "structured_response_generation": True,
                "multi_modal_processing": self.enhanced_tokenizer is not None,
                "context_integration": True,
                "batch_processing": True
            },
            "demo_results": demo_results,
            "timestamp": datetime.now().isoformat()
        }
        
        with open("standalone_wavecaster_status.json", 'w', encoding='utf-8') as f:
            json.dump(system_status, f, indent=2, ensure_ascii=False)
        
        print("✅ System status saved to standalone_wavecaster_status.json")

async def main():
    """Main function to run standalone wavecaster system."""
    print("🚀 Standalone Wavecaster System")
    print("=" * 35)
    
    # Initialize system
    wavecaster = StandaloneWavecasterSystem()
    
    # Run demo
    demo_results = await wavecaster.run_comprehensive_demo()
    
    # Save status
    wavecaster.save_system_status(demo_results)
    
    print("\n🎉 Standalone Wavecaster System Complete!")
    print("🌊 System ready for advanced AI applications!")
    
    return demo_results

if __name__ == "__main__":
    asyncio.run(main())