import sys
from fastapi import HTTPException
import os
from ..utils.hashing import HashGenerator
from user_journey_service.crew import UserJourney 
from user_journey_service.processors.duration_estimator import MicrolearningDurationEstimator
from user_journey_service.processors.StagewiseCourseParser import CourseOutlineParser
from user_journey_service.processors.content_reviewer import ContentReviewer
from user_journey_service.processors.user_journey_synthesizer import Synthesizer
from user_journey_service.tools.custom_stt_tool import LiveWhisperSTTTool
# from user_journey_service.tools.custom_tts_tool import RealTime_TTS
# Module-level singletons: processor/tool instances built once at import time
# and shared by every request handled by this service.
duration_estimator = MicrolearningDurationEstimator()
content_reviewer = ContentReviewer()
synthesizer = Synthesizer()
stt_tool = LiveWhisperSTTTool()
# tts_tool = RealTime_TTS()
from pathlib import Path
import re
import whisper
from gtts import gTTS
from pydub import AudioSegment
import sys
import subprocess

# Configure pydub to use ffmpeg correctly
def configure_pydub():
    """Point pydub at a usable ffmpeg/ffprobe binary.

    Search order:
      1. an ffmpeg binary sitting next to this module,
      2. the same binary up to three parent directories above it,
      3. whatever ``ffmpeg`` is on the PATH (``shutil.which`` fallback).

    On Windows the bundled binary is expected as ``ffmpeg.exe``; on other
    platforms a plain ``ffmpeg`` executable is searched for, so the lookup
    is no longer Windows-only.

    Returns:
        bool: True when pydub was configured with a converter, False when
        no ffmpeg could be located (audio conversion will likely fail).
    """
    import shutil

    current_dir = os.path.dirname(os.path.abspath(__file__))
    exe_name = 'ffmpeg.exe' if os.name == 'nt' else 'ffmpeg'
    probe_name = 'ffprobe.exe' if os.name == 'nt' else 'ffprobe'

    def find_ffmpeg():
        """Locate ffmpeg near this file, walking up to 3 parent dirs."""
        candidate = os.path.join(current_dir, exe_name)
        if os.path.exists(candidate):
            return candidate

        for levels in range(1, 4):
            parent_dir = os.path.join(current_dir, *['..'] * levels)
            candidate = os.path.abspath(os.path.join(parent_dir, exe_name))
            if os.path.exists(candidate):
                return candidate

        # Fall back to a PATH lookup so system-wide installs also work.
        return shutil.which('ffmpeg')

    ffmpeg_path = find_ffmpeg()

    if ffmpeg_path:
        AudioSegment.converter = ffmpeg_path

        # Prefer a sibling ffprobe; reuse ffmpeg itself when absent.
        ffprobe_path = os.path.join(os.path.dirname(ffmpeg_path), probe_name)
        if os.path.exists(ffprobe_path):
            AudioSegment.ffprobe = ffprobe_path
        else:
            AudioSegment.ffprobe = ffmpeg_path

        print(f"✓ Pydub configured with: {ffmpeg_path}")
        return True
    else:
        print("⚠ Warning: ffmpeg not found. Audio conversion may fail.")
        return False

# Configure pydub once at import time so every ContentCreationService
# instance can rely on AudioSegment having a converter set.
# NOTE(review): this runs side effects (filesystem probing + prints) on
# import -- confirm that is acceptable for all importers of this module.
pydub_configured = configure_pydub()

class ContentCreationService:
    def __init__(self, input_data):
        """Cache the input payload, its hash, and every derived artifact path."""
        self.crew_instance = UserJourney()
        self.input_data = input_data
        # All on-disk artifacts for this request are keyed by the input hash.
        content_hash = HashGenerator.generate_input_hash(input_data)
        self.input_hash = content_hash
        self.research_file = f"research/{content_hash}.md"
        self.output_file = f"output/{content_hash}.md"
        self.output_file_1 = f"output1/{content_hash}.md"
        self.output_file_2 = f"output2/{content_hash}.md"
        self.json_output_path = f'parsed_course_content/{content_hash}.json'

    
    def _has_updated_content_questions(self, module_idx, lesson_number):
        """Return True when both the rewritten lesson content and the
        question file already exist on disk for this lesson."""
        # "1.1" -> "1_1" so the lesson number is filesystem-safe.
        safe = lesson_number.replace('.', '_')
        stem = f"module{module_idx}_lesson{safe}"

        content = Path(f'updated_content/{self.input_hash}/{stem}.md')
        questions = Path(f"question/{self.input_hash}/{stem}.md")

        return content.exists() and questions.exists()

    def _has_audio_content_and_questions(self, module_idx, lesson_number):
        """Return True when the lesson narration WAV and all three question
        WAV files already exist on disk for this lesson."""
        print("Inside audio files availability check function")

        # "1.1" -> "1_1" so the lesson number is filesystem-safe.
        safe = lesson_number.replace('.', '_')
        stem = f"module{module_idx}_lesson{safe}"

        lesson_wav = Path(f'audio/lessons/{self.input_hash}/{stem}.wav')
        question_wavs = [
            Path(f'audio/questions/{self.input_hash}/{stem}/{i}.wav')
            for i in (1, 2, 3)
        ]

        result = lesson_wav.exists() and all(p.exists() for p in question_wavs)
        print(f"The output is : {result}")
        return result
           
    def _run_evaluation(self, idx):
        """Interactively quiz the user over the stage-``idx`` question audio.

        For every WAV under audio/questions/<hash>/stage<idx>/ the question is
        transcribed with Whisper, the user types an answer on stdin, and each
        Q/A pair is scored by the evaluator crew.  Afterwards the accumulated
        answers are sent to the assessment crew in one batch.

        NOTE(review): blocks on input() twice per question, so this is only
        usable from an interactive terminal -- confirm it is never reached
        from the FastAPI request path.
        """
        all_qns_answer = {}
        audio_content = Path(f'audio/stage/{self.input_hash}/stage{idx}.wav')
        print(f"The audio content is available at : {audio_content}")
        audio_question = Path(f'audio/questions/{self.input_hash}/stage{idx}/')
        # Question files are numbered (1.wav, 2.wav, ...); sort for stable order.
        wav_files = sorted(audio_question.glob('*.wav'))
        for wav_file in wav_files:
            print(wav_file)
            question = self.wav_to_text_whisper(wav_file)
            input("🎤 Press Enter when you're ready to answer...")
            print("📢 Listening to your answer...")
            answer = str(input("Enter your answer: "))
            inputs = {"question": question, "answer": answer}
            all_qns_answer[question] = answer
            # One evaluator-crew run per question/answer pair.
            crew = self.crew_instance.evaluator_crew()
            crew.kickoff(inputs=inputs)
        # Final holistic assessment over every collected answer.
        inputs = {"users_response": all_qns_answer}
        crew = self.crew_instance.assessment_crew()
        crew.kickoff(inputs=inputs)
    
    
    def _create_audio_files(self, module_idx, lesson_number):
        """Create narration + question audio for one lesson, skipping work
        that already exists on disk.

        NOTE(review): this method creates/checks ``.mp3`` files while
        _has_audio_content_and_questions() checks ``.wav`` paths, so the two
        can disagree about whether a lesson's audio exists -- confirm which
        extension is canonical.
        """
        # Convert lesson_number (e.g., "1.1") to safe filename format
        lesson_safe = lesson_number.replace('.', '_')

        # Content audio: synthesized only when the mp3 is not already present.
        audio_content = Path(f'audio/lessons/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.mp3')
        if audio_content.exists():
            print(f"Audio for the content is already available at: {audio_content}")
        else:
            os.makedirs(os.path.dirname(f"audio/lessons/{self.input_hash}/"), exist_ok=True)
            content_file = Path(f'updated_content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md')
            with open(content_file, 'r', encoding='utf-8') as f:
                text_content = f.read()
            self.text_to_wav(text_content, audio_content)

        # Question audio: regenerate the whole set when no mp3 exists yet.
        audio_question_dir = Path(f'audio/questions/{self.input_hash}/module{module_idx}_lesson{lesson_safe}/')
        wav_files = sorted(audio_question_dir.glob('*.mp3'))
        if not wav_files:
            os.makedirs(audio_question_dir, exist_ok=True)
            print("Question audio is not available")
            self._create_audio_question(module_idx, lesson_number)

    def _create_audio_files_without_check(self, module_idx, lesson_number):
        """Create audio files for a lesson without checking if they exist.

        Unconditionally re-synthesizes the lesson narration mp3 and the
        question audio set, overwriting any previous output.
        """
        # Convert lesson_number (e.g., "1.1") to safe filename format
        lesson_safe = lesson_number.replace('.', '_')

        # Create content audio (always, even if an mp3 is already there).
        audio_content = Path(f'audio/lessons/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.mp3')
        os.makedirs(os.path.dirname(f"audio/lessons/{self.input_hash}/"), exist_ok=True)
        content_file = Path(f'updated_content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md')
        with open(content_file, 'r', encoding='utf-8') as f:
            text_content = f.read()
        self.text_to_wav(text_content, audio_content)

        # Create question audio directory and regenerate every question clip.
        audio_question_dir = Path(f'audio/questions/{self.input_hash}/module{module_idx}_lesson{lesson_safe}/')
        os.makedirs(audio_question_dir, exist_ok=True)
        print("Question audio is not available")
        self._create_audio_question(module_idx, lesson_number)
        
    def _create_audio_files_04_01_2026(self, idx):
        """Legacy stage-based audio creation (dated variant).

        Creates narration for ``stage<idx>`` plus its question audio set.

        FIXME: the call at the bottom passes a single argument, but
        _create_audio_question() is defined as
        ``(self, module_idx, lesson_number)`` -- executing this path raises
        TypeError.  Kept byte-identical pending a decision on whether the
        stage-based flow is still needed.
        """
        audio_content = Path(f'audio/stage/{self.input_hash}/stage{idx}.wav')
        if audio_content.exists():
            print(f"Audio for the content is already available at :{audio_content}")
        else:
            os.makedirs(os.path.dirname(f"audio/stage/{self.input_hash}/"), exist_ok=True)
            content_file = Path(f'updated_content/{self.input_hash}/stage{idx}.md')
            with open(content_file, 'r', encoding='utf-8') as f:
                text_content = f.read()
            self.text_to_wav(text_content, audio_content)

        audio_question = Path(f'audio/questions/{self.input_hash}/stage{idx}/')
        wav_files = sorted(audio_question.glob('*.wav'))
        if not wav_files:
            os.makedirs(os.path.dirname(f"audio/questions/{self.input_hash}/stage{idx}/"), exist_ok=True)
            print("question audio is not available")
            # FIXME: missing lesson_number argument -- see docstring.
            self._create_audio_question(idx)

    
    def _create_audio_question(self, module_idx, lesson_number):
        """Synthesize one audio clip per parsed question for a lesson.

        Reads question/<hash>/module<m>_lesson<l>.md, splits it into
        individual questions, and writes one numbered clip per question
        under audio/questions/<hash>/.  Prints a warning and does nothing
        when the question file is missing.
        """
        # "1.1" -> "1_1" so the lesson number is filesystem-safe.
        safe = lesson_number.replace('.', '_')
        question_file = Path(f"question/{self.input_hash}/module{module_idx}_lesson{safe}.md")

        # Guard clause: nothing to synthesize without a question file.
        if not question_file.exists():
            print(f"Question file not found: {question_file}")
            return

        with open(question_file, 'r', encoding='utf-8') as f:
            questions = f.read()

        parsed_questions = self.parse_questions(questions)
        print(f"Found {len(parsed_questions)} questions to convert to audio for Module {module_idx}, Lesson {lesson_number}")

        for index, question in enumerate(parsed_questions, start=1):
            print(f"\n🧠 Asking Question {index} for Module {module_idx}, Lesson {lesson_number}: {question[:100]}...\n")
            clip_path = Path(f'audio/questions/{self.input_hash}/module{module_idx}_lesson{safe}/{index}.wav')
            self.text_to_wav(question, clip_path)
            
    
    def parse_questions(self, md_text):
        """Parse individual questions out of a markdown question document.

        Questions are expected as ``### Q<n>: ...`` headings; every
        non-heading line up to the next heading is folded into the current
        question.  Trivially short fragments (<= 10 chars) are discarded,
        and a single-regex fallback runs when the line scan finds nothing.

        Args:
            md_text: Raw markdown text of the question file.

        Returns:
            list[str]: Up to 9 question strings, in document order.
        """
        questions = []
        current_question = []

        for raw_line in md_text.split('\n'):
            line = raw_line.strip()
            # (Fixed: condition was duplicated with `or`, and a second,
            # never-matching re.sub followed the prefix strip.)
            if line.startswith('### Q'):
                # New question heading: flush whatever was accumulated.
                if current_question:
                    questions.append(' '.join(current_question).strip())
                    current_question = []
                # Strip the "### Q1:" prefix so only the question text is left.
                line = re.sub(r'^### Q\d+:\s*', '', line)

            # Any remaining non-heading text belongs to the current question.
            if line and not line.startswith('###'):
                current_question.append(line)

        if current_question:
            questions.append(' '.join(current_question).strip())

        # Drop empty / trivially short fragments.
        questions = [q for q in questions if q and len(q) > 10]

        # Fallback: one regex pass when line scanning found nothing usable.
        if not questions:
            pattern = r'### Q\d+:?\s*(.*?)(?=\n### Q\d+:|$)'
            questions = re.findall(pattern, md_text, re.DOTALL | re.IGNORECASE)
            questions = [q.strip() for q in questions if q.strip()]

        return questions[:9]  # Return max 9 questions


    
    def intelligent_text_cleaner(self, text):
        """Strip markdown/formatting noise so text reads naturally when
        spoken aloud, returning one continuous cleaned line."""
        if not text:
            return ""

        text = str(text)

        # Pass 1: delete constructs that sound awkward in speech.
        removal_patterns = (
            r'^#{1,6}\s+',        # markdown headings
            r'\*\*|\*|__|_',      # bold / italic markers
            r'```.*?```',         # fenced code blocks
            r'`[^`]+`',           # inline code
            r'!?\[.*?\]\(.*?\)',  # links and images
            r'&[a-z]+;',          # HTML entities
            r'^\s*[-*+]\s+',      # bullet points
            r'^\s*\d+\.\s+',      # numbered lists
            r'^>\s+',             # blockquotes
            r'\|-+\|',            # table separator rows
            r'^\|.*?\|$',         # table rows
        )
        for rx in removal_patterns:
            text = re.sub(rx, '', text, flags=re.MULTILINE | re.DOTALL)

        # Pass 2: rewrite remaining structures as natural speech.
        speech_rewrites = (
            (r'^#{1,3}\s+(.*?)\s*$', r'\1. '),
            (r'^\s*[-*+]\s+(.*?)\s*$', r'\1. '),
            (r'^\s*\d+\.\s+(.*?)\s*$', r'\1. '),
            (r'^(Introduction|Conclusion|Summary|Note|Tip|Warning|Important):\s*', ''),
            (r'^(Step \d+|Phase \d+|Part \d+):\s*', ''),
            (r'^(Q\d+|Question \d+)[:.]?\s*', ''),
            (r'^(Easy|Moderate|Difficult|Hard)[-:]?\s*', ''),
        )
        for rx, repl in speech_rewrites:
            text = re.sub(rx, repl, text, flags=re.MULTILINE | re.IGNORECASE)

        # Collapse runs of terminal punctuation into a single period.
        text = re.sub(r'[.!?]{2,}', '.', text)

        # Pass 3: drop blank lines and capitalize each kept line.
        kept = []
        for candidate in text.split('\n'):
            candidate = candidate.strip()
            if not candidate:
                continue
            if candidate[0].islower() and len(candidate) > 1:
                candidate = candidate[0].upper() + candidate[1:]
            kept.append(candidate)

        # Join into one continuous utterance and squeeze whitespace.
        return re.sub(r'\s+', ' ', ' '.join(kept)).strip()

    def optimal_markdown_to_speech(self, text):
        """Rewrite markdown as speakable text.

        Headings get spoken labels, list items become bullet lines, inline
        formatting is stripped, and a blank line is inserted after headings
        and bullets to act as a pause.
        """
        if not text:
            return ""

        spoken = []
        # (marker, spoken label) pairs, checked in this order.
        heading_labels = (
            ('# ', 'Main topic: '),
            ('## ', 'Section: '),
            ('### ', 'Subsection: '),
            ('#### ', ''),
        )

        for raw in text.split('\n'):
            line = raw.strip()
            if not line:
                continue

            # Headings -> labelled speech lines.
            for marker, label in heading_labels:
                if line.startswith(marker):
                    spoken.append(f"{label}{line[len(marker):].strip()}")
                    break
            else:
                if line[:2] in ('- ', '* ', '+ '):
                    # Unordered list item: strip inline formatting.
                    item = line[2:].strip()
                    item = re.sub(r'[\*_]{1,2}(.*?)[\*_]{1,2}', r'\1', item)
                    item = re.sub(r'`([^`]+)`', r'code \1', item)
                    spoken.append(f"• {item}")
                elif re.match(r'^\d+\.\s+', line):
                    # Ordered list item: same treatment, numeral dropped.
                    item = re.sub(r'^\d+\.\s+', '', line)
                    item = re.sub(r'[\*_]{1,2}(.*?)[\*_]{1,2}', r'\1', item)
                    spoken.append(f"• {item}")
                else:
                    # Plain text: unwrap bold/italic, announce code, keep
                    # link text only.
                    plain = re.sub(r'\*\*(.*?)\*\*', r'\1', line)
                    plain = re.sub(r'\*(.*?)\*', r'\1', plain)
                    plain = re.sub(r'`([^`]+)`', r'code \1', plain)
                    plain = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', plain)
                    if plain.strip():
                        spoken.append(plain)

        # Insert pauses (empty lines) after headings and bullet items.
        paced = []
        for entry in spoken:
            paced.append(entry)
            if (entry.startswith(('Main topic:', 'Section:', 'Subsection:'))
                    or entry.startswith('• ')):
                paced.append('')

        return '\n'.join(paced).strip()
    


    def markdown_to_speech(self, text):
        """
        Convert markdown to natural speech while preserving semantic meaning.
        Handles: # Headings, **bold**, *italic*, - bullets, 1. numbered lists, `code`, [links](url)

        Heading levels H1..H6 map to spoken labels ("Title:", "Chapter:",
        "Topic:", "Section:", "Minor section:", "Subsection:"); lists,
        checkboxes, blockquotes and table rows are rewritten as labelled
        lines; the result is passed through _add_speech_pauses().

        Returns '' for falsy input.
        """
        if not text:
            return ""
        
        text = str(text)
        lines = text.split('\n')
        result_lines = []
        
        # NOTE: branches are ordered longest-marker-first ('###### ' before
        # '# ') so deeper headings are not swallowed by shallower checks.
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # Handle headings with appropriate speech cues
            if line.startswith('###### '):
                # H6: Smallest heading - say with slight emphasis
                content = line[6:].strip()
                result_lines.append(f"Subsection: {content}")
                
            elif line.startswith('##### '):
                # H5: Small heading
                content = line[5:].strip()
                result_lines.append(f"Minor section: {content}")
                
            elif line.startswith('#### '):
                # H4: Minor heading
                content = line[4:].strip()
                result_lines.append(f"Section: {content}")
                
            elif line.startswith('### '):
                # H3: Subheading - say with clear emphasis
                content = line[3:].strip()
                result_lines.append(f"Topic: {content}")
                
            elif line.startswith('## '):
                # H2: Major section - say with strong emphasis
                content = line[2:].strip()
                result_lines.append(f"Chapter: {content}")
                
            elif line.startswith('# '):
                # H1: Main title - say with maximum emphasis
                content = line[1:].strip()
                result_lines.append(f"Title: {content}")
            
            # Handle bullet points (-, *, +)
            elif re.match(r'^[-*+]\s+', line):
                content = re.sub(r'^[-*+]\s+', '', line)
                # Remove any bold/italic from bullet content
                content = self._remove_inline_formatting(content)
                result_lines.append(f"• {content}")
            
            # Handle numbered lists (1., 2., etc.)
            elif re.match(r'^\d+\.\s+', line):
                match = re.match(r'^(\d+)\.\s+(.*)', line)
                if match:
                    number = match.group(1)
                    content = match.group(2)
                    content = self._remove_inline_formatting(content)
                    result_lines.append(f"Point {number}: {content}")
            
            # Handle checkboxes
            elif re.match(r'^\[[ xX]\]\s+', line):
                content = re.sub(r'^\[[ xX]\]\s+', '', line)
                content = self._remove_inline_formatting(content)
                # NOTE(review): the second clause is redundant -- line.lower()
                # already contains '[x]' whenever '[X]' is in line.
                if '[x]' in line.lower() or '[X]' in line:
                    result_lines.append(f"Completed: {content}")
                else:
                    result_lines.append(f"To do: {content}")
            
            # Handle blockquotes
            elif line.startswith('> '):
                content = line[1:].strip()
                content = self._remove_inline_formatting(content)
                result_lines.append(f"Quote: {content}")
            
            # Handle horizontal rule
            elif re.match(r'^[-*_]{3,}$', line):
                result_lines.append("---")  # Pause marker
            
            # Handle tables (simplify them)
            elif '|' in line and re.search(r'\w.*\|.*\w', line):
                # Skip table formatting lines
                if not re.match(r'^\|?[-:| ]+\|?$', line):
                    # Extract cell content
                    cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                    if cells:
                        result_lines.append(f"Table row: {'; '.join(cells)}")
            
            # Regular text with inline formatting
            else:
                content = self._remove_inline_formatting(line)
                if content:
                    result_lines.append(content)
        
        # Join with appropriate pauses
        return self._add_speech_pauses('\n'.join(result_lines))

    def _remove_inline_formatting(self, text):
        """Strip inline markdown while keeping the readable text.

        Bold/italic markers are dropped, inline code and strikethrough are
        announced with a spoken prefix, images become "Image: alt", links
        collapse to their text, and common HTML entities are verbalized.

        Args:
            text: A single line (or fragment) of markdown.

        Returns:
            str: The cleaned, speech-friendly text ('' for falsy input).
        """
        if not text:
            return ""

        # Spoken prefixes for constructs that should be announced.
        def replace_inline_code(match):
            return f"code: {match.group(1)}"

        def replace_strikethrough(match):
            return f"strikethrough: {match.group(1)}"

        # Bold/italic markers carry no speakable content -- just unwrap them.
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
        text = re.sub(r'\*(.*?)\*', r'\1', text)
        text = re.sub(r'__(.*?)__', r'\1', text)
        text = re.sub(r'_(.*?)_', r'\1', text)
        text = re.sub(r'`(.*?)`', replace_inline_code, text)
        text = re.sub(r'~~(.*?)~~', replace_strikethrough, text)

        # Images first: ![alt](url) -> "Image: alt".  This must run before
        # the link rule; previously links ran first, so the link pattern
        # consumed "[alt](url)" and left a stray "!" behind.
        text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'Image: \1', text)

        # Links: [text](url) -> "text".
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

        # Verbalize common HTML entities.
        text = text.replace('&nbsp;', ' ')
        text = text.replace('&amp;', 'and')
        text = text.replace('&lt;', 'less than')
        text = text.replace('&gt;', 'greater than')

        return text.strip()

    def _add_speech_pauses(self, text):
        """Insert blank lines ("pauses") after headings, list items, and
        most regular lines so downstream TTS breathes naturally."""
        source_lines = text.split('\n')
        total = len(source_lines)
        paced = []

        for idx, raw in enumerate(source_lines):
            current = raw.strip()
            if not current:
                continue

            paced.append(current)
            has_next = idx < total - 1

            if current.startswith(('Title:', 'Chapter:', 'Topic:', 'Section:')):
                # Long pause after any heading line.
                paced.append("")
            elif current.startswith(('• ', 'Point ', 'Completed:', 'To do:')):
                # Pause only once a run of list items ends.
                if has_next and not source_lines[idx + 1].startswith(('• ', 'Point ')):
                    paced.append("")
            else:
                # Small pause unless the next line continues this thought.
                if (has_next and source_lines[idx + 1].strip()
                        and not source_lines[idx + 1].startswith(('• ', 'Point ', 'Title:', 'Chapter:'))):
                    paced.append("")

        return '\n'.join(paced)

    def text_to_wav_old(self, text, output_file):
        """Legacy text-to-audio path (superseded by text_to_wav).

        Cleans *text*, synthesizes it to a temporary MP3 with gTTS, then
        moves that MP3 next to *output_file* with a ``.mp3`` suffix --
        despite the name, no WAV file is produced.

        Returns:
            str | None: The final .mp3 path, or None on failure/empty text.

        NOTE(review): appears unused within this file -- candidate for
        removal once callers are confirmed.
        """
        print(f"Converting text to audio: {output_file}")
    
        try:
            # Clean text
            text = str(text).strip()
            text = self.intelligent_text_cleaner(text)
            #text = self.optimal_markdown_to_speech(text)
            #text = self.markdown_to_speech(text)
            if not text:
                print("Warning: Empty text, skipping audio creation")
                return None
            
            # Create unique temp file name to avoid conflicts
            import uuid
            temp_mp3 = f"temp_audio_{uuid.uuid4().hex[:8]}.mp3"
            
            # Convert text to mp3 using gTTS
            print(f"Creating MP3 with gTTS: {temp_mp3}")
            tts = gTTS(text=text, lang='en', slow=False)
            tts.save(temp_mp3)
            print("✓ gTTS conversion complete")
            
            # Save as MP3 directly (skip WAV conversion)
            mp3_output = str(output_file).replace('.wav', '.mp3')
            
            # Move the temp file to final location
            if os.path.exists(temp_mp3):
                os.replace(temp_mp3, mp3_output)
                print(f"✓ Audio saved as {mp3_output}")
                
                # Create a placeholder WAV file if needed
                #if output_file.endswith('.wav'):
                #    with open(output_file, 'w') as f:
                #        f.write(f"Audio available at: {mp3_output}")
                #    print(f"✓ Created placeholder WAV file: {output_file}")
                
                return mp3_output
                
        except Exception as e:
            print(f"❌ Error in text_to_wav: {e}")
            import traceback
            traceback.print_exc()
            
            # Last resort: save text to file
            error_file = str(output_file) + ".error.txt"
            with open(error_file, 'w', encoding='utf-8') as f:
                f.write(f"Error: {e}\n\nText: {text[:500]}")
            print(f"Saved error details to: {error_file}")
            
        return None
    
    
    
    def text_to_wav(self, text, output_file):
        """Synthesize *text* to an audio file via gTTS.

        The text is cleaned for speech, rendered to a temporary MP3, and
        the MP3 is moved to *output_file* (parent directories are created).

        NOTE(review): gTTS always produces MP3 data; when callers pass a
        ``.wav`` path the saved file is MP3 bytes under a .wav name --
        confirm downstream consumers (e.g. Whisper) accept that.

        Args:
            text: Arbitrary text/markdown to speak.
            output_file: Destination path (str or Path).

        Returns:
            str | None: The destination path on success, else None (an
            ``.error.txt`` file is written best-effort on failure).
        """
        print(f"Converting text to audio: {output_file}")

        try:
            # Normalize and strip markdown so the speech sounds natural.
            text = str(text).strip()
            text = self.intelligent_text_cleaner(text)

            if not text:
                print("Warning: Empty text, skipping audio creation")
                return None

            # Unique temp name so concurrent conversions never collide.
            import uuid
            temp_mp3 = f"temp_audio_{uuid.uuid4().hex[:8]}.mp3"

            print(f"Creating MP3 with gTTS: {temp_mp3}")
            tts = gTTS(text=text, lang='en', slow=False)
            tts.save(temp_mp3)
            print("✓ gTTS conversion complete")

            # Ensure the destination directory exists before moving.
            output_file = Path(output_file)
            output_file.parent.mkdir(parents=True, exist_ok=True)
            print(f"✓ Directory created/verified: {output_file.parent}")

            if os.path.exists(temp_mp3):
                # shutil.move handles cross-device renames, unlike os.replace.
                import shutil
                shutil.move(temp_mp3, str(output_file))
                print(f"✓ Audio saved as {output_file}")
                return str(output_file)
            else:
                print(f"❌ Temp file not found: {temp_mp3}")

        except Exception as e:
            print(f"❌ Error in text_to_wav: {e}")
            import traceback
            traceback.print_exc()

            # Best effort: persist the error + a text snippet for debugging.
            error_file = str(output_file) + ".error.txt"
            try:
                with open(error_file, 'w', encoding='utf-8') as f:
                    f.write(f"Error: {e}\n\nText: {text[:500]}")
                print(f"Saved error details to: {error_file}")
            except OSError:
                # Was a bare `except:` (would swallow KeyboardInterrupt /
                # SystemExit); only filesystem errors are expected here.
                print(f"Could not save error file")

        return None
    



    def wav_to_text_whisper(self, wav_file):
        """Transcribe an audio file to text with OpenAI Whisper.

        Args:
            wav_file: Path (str or Path) to the audio file.

        Returns:
            str: The transcribed text.
        """
        print("inside wave to text conversion")
        # Load the "base" model once and cache it on the instance: loading
        # weights from disk is expensive and was previously repeated on
        # every single transcription call.
        model = getattr(self, "_whisper_model", None)
        if model is None:
            model = whisper.load_model("base")
            self._whisper_model = model
        result = model.transcribe(str(wav_file))
        print("\U0001F3A4 Transcription:", result["text"])
        return result["text"]
    


    def run_audio_creation(self):
        """Creates audio files for existing content and questions.

        Parses the generated course outline (``self.output_file``), walks
        every module/lesson, and delegates per-lesson synthesis to
        _create_lesson_audio_files(), accumulating counters as it goes.

        Returns:
            dict: status / message / hashid / stats / summary on success, or
            a failure dict when the course outline does not exist yet.

        Raises:
            HTTPException: 500 wrapper around any unexpected error.
        """
        try:
            # The outline must have been generated first; without it there
            # is nothing to narrate.
            if not os.path.exists(self.output_file):
                return {"status": "failure", "message": "User journey is not created."}

            print("Starting audio creation process...")
            parser = CourseOutlineParser(self.output_file)
            parsed_result = parser.parse_content()
            print(f"Course Title: {parsed_result['course_title']}\n")
            course_title = parsed_result['course_title']
            
            # Counters reported back to the caller in the summary.
            audio_stats = {
                "total_lessons": 0,
                "content_audio_created": 0,
                "question_audio_created": 0,
                "skipped_content": 0,
                "skipped_questions": 0
            }
            
            # Iterate through modules
            for module_idx, module in enumerate(parsed_result['modules'], start=1):
                module_title = module['module_title']
                module_name = re.sub(r'^Module \d+:\s*', '', module_title).strip()
                
                print(f"Processing Module {module_idx}: {module_name}")
                
                # Iterate through lessons in this module
                for lesson_idx, lesson in enumerate(module['lessons'], start=1):
                    lesson_title_full = lesson['lesson_title']
                    
                    # Extract lesson number (e.g., "1.1", "1.2") from lesson title;
                    # fall back to positional numbering when the title lacks one.
                    lesson_number_match = re.search(r'Lesson (\d+\.\d+)', lesson_title_full)
                    if lesson_number_match:
                        lesson_number = lesson_number_match.group(1)
                    else:
                        lesson_number = f"{module_idx}.{lesson_idx}"
                    
                    # Extract lesson name
                    lesson_name = re.sub(r'^Lesson \d+\.\d+:\s*', '', lesson_title_full).strip()
                    
                    audio_stats["total_lessons"] += 1
                    
                    print(f"  Processing Lesson {lesson_number}: {lesson_name}")
                    
                    # Create audio files for this lesson
                    lesson_audio_stats = self._create_lesson_audio_files(module_idx, lesson_number)
                    
                    # Update stats
                    audio_stats["content_audio_created"] += lesson_audio_stats["content_created"]
                    audio_stats["question_audio_created"] += lesson_audio_stats["questions_created"]
                    audio_stats["skipped_content"] += lesson_audio_stats["content_skipped"]
                    audio_stats["skipped_questions"] += lesson_audio_stats["questions_skipped"]
            
            # Return summary
            return {
                "status": "success",
                "message": "Audio creation completed.",
                "hashid": self.input_hash,
                "stats": audio_stats,
                "summary": f"Created {audio_stats['content_audio_created']} content audio files and {audio_stats['question_audio_created']} question audio files for {audio_stats['total_lessons']} lessons."
            }

        except Exception as e:
            # Surface any unexpected failure as an HTTP 500 with a trace in
            # the server log.
            import traceback
            error_details = traceback.format_exc()
            print(f"ERROR in run_audio_creation: {e}")
            print(f"Traceback: {error_details}")
            raise HTTPException(status_code=500, detail=f"Unexpected error in audio creation: {e}")
    

    def _create_lesson_audio_files(self, module_idx, lesson_number):
        """Create content and question audio files for one lesson.

        Args:
            module_idx: 1-based module index used in file names.
            lesson_number: dotted lesson number (e.g. "1.2"); dots are
                replaced with underscores to build safe file names.

        Returns:
            dict with counters: content_created, questions_created,
            content_skipped, questions_skipped.  Errors during audio
            generation are logged and swallowed so one bad lesson does
            not abort the whole batch.
        """
        stats = {
            "content_created": 0,
            "questions_created": 0,
            "content_skipped": 0,
            "questions_skipped": 0
        }
        
        # Convert lesson_number to safe filename format (e.g. "1.2" -> "1_2")
        lesson_safe = lesson_number.replace('.', '_')
        
        # 1. Create content audio if it doesn't exist
        content_audio_path = Path(f'audio/lessons/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.mp3')
        content_file = Path(f'updated_content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md')
        
        if content_file.exists():
            if not content_audio_path.exists():
                print(f"    Creating content audio...")
                try:
                    with open(content_file, 'r', encoding='utf-8') as f:
                        text_content = f.read()
                    self.text_to_wav(text_content, content_audio_path)
                    stats["content_created"] += 1
                    print(f"    ✓ Content audio created")
                except Exception as e:
                    # Best-effort: log and continue with the next artifact.
                    print(f"    ✗ Error creating content audio: {e}")
            else:
                print(f"    Content audio already exists")
                stats["content_skipped"] += 1
        else:
            print(f"    Warning: Content file not found: {content_file}")
        
        # 2. Create question audio if they don't exist
        question_audio_dir = Path(f'audio/questions/{self.input_hash}/module{module_idx}_lesson{lesson_safe}/')
        question_file = Path(f"question/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md")
        
        if question_file.exists():
            # List existing audio BEFORE creating the directory so a freshly
            # created empty directory is not mistaken for prior work.
            existing_audio = sorted(question_audio_dir.glob('*.mp3')) if question_audio_dir.exists() else []
            
            # exist_ok=True makes a separate existence check redundant.
            question_audio_dir.mkdir(parents=True, exist_ok=True)
            
            if existing_audio:
                print(f"    {len(existing_audio)} question audio files already exist")
                stats["questions_skipped"] += len(existing_audio)
            else:
                print(f"    Creating question audio files...")
                try:
                    with open(question_file, 'r', encoding='utf-8') as f:
                        questions = f.read()
                    
                    parsed_questions = self.parse_questions(questions)
                    created_count = 0
                    
                    # One numbered mp3 per parsed question (1.mp3, 2.mp3, ...).
                    for index, question in enumerate(parsed_questions, start=1):
                        audio_question_path = question_audio_dir / f"{index}.mp3"
                        if not audio_question_path.exists():
                            self.text_to_wav(question, audio_question_path)
                            created_count += 1
                    
                    stats["questions_created"] += created_count
                    print(f"    ✓ Created {created_count} question audio files")
                    
                except Exception as e:
                    print(f"    ✗ Error creating question audio: {e}")
        else:
            print(f"    Warning: Question file not found: {question_file}")
        
        return stats


    def run_content_creation_23012026(self):
        """Legacy (dated 23-01-2026) content-creation pass.

        Superseded by run_content_creation, which additionally cleans the
        generated files.  Parses the previously generated user-journey
        outline and creates markdown content/question files for every
        lesson that needs them (NO audio creation).

        Returns:
            dict with status and per-lesson creation stats, or a failure
            dict when the user journey has not been generated yet.

        Raises:
            HTTPException: 500 on any unexpected error.
        """
        try:
            if not os.path.exists(self.output_file):
                return {"status": "failure", "message": "User journey is not created."}

            # Early return above guarantees the journey file exists here;
            # the original re-checked os.path.exists redundantly.
            print("The user journey is created")
            parser = CourseOutlineParser(self.output_file)
            parsed_result = parser.parse_content()
            os.makedirs(os.path.dirname(self.json_output_path), exist_ok=True)
            parser.export_to_json(self.json_output_path, parsed_result)
            print(f"Course Title: {parsed_result['course_title']}\n")
            course_title = parsed_result['course_title']

            content_stats = {
                "total_lessons": 0,
                "content_created": 0,
                "content_updated": 0,
                "questions_created": 0,
                "questions_updated": 0
            }

            # Iterate through modules
            for module_idx, module in enumerate(parsed_result['modules'], start=1):
                module_title = module['module_title']
                module_focus = module['focus']
                module_outcome = module['outcome']
                module_duration = module['duration']

                print(f"Processing Module {module_idx}: {module_title}")

                # Strip the "Module N:" prefix to get the bare module name
                module_name = re.sub(r'^Module \d+:\s*', '', module_title).strip()

                # Iterate through lessons in this module
                for lesson_idx, lesson in enumerate(module['lessons'], start=1):
                    lesson_title_full = lesson['lesson_title']
                    lesson_duration = lesson['duration']
                    lesson_outcome = lesson['outcome']
                    lesson_topics = lesson['topics']

                    # Prefer the explicit "Lesson X.Y" number from the title;
                    # fall back to positional numbering.
                    lesson_number_match = re.search(r'Lesson (\d+\.\d+)', lesson_title_full)
                    if lesson_number_match:
                        lesson_number = lesson_number_match.group(1)
                    else:
                        lesson_number = f"{module_idx}.{lesson_idx}"

                    # Strip the "Lesson X.Y:" prefix to get the bare name
                    lesson_name = re.sub(r'^Lesson \d+\.\d+:\s*', '', lesson_title_full).strip()

                    content_stats["total_lessons"] += 1

                    print(f"  Processing Lesson {lesson_number}: {lesson_name}")

                    # Target 120-130 words per minute of lesson duration;
                    # default to 20 minutes when no number is present.
                    duration_match = re.search(r'(\d+)', lesson_duration)
                    minutes = int(duration_match.group()) if duration_match else 20
                    lower_range = minutes * 120
                    upper_range = minutes * 130

                    if self._should_create_content(module_idx, lesson_number):
                        print("    Creating new content and questions...")
                        # Return value is a status string; the real outputs are
                        # the files written under content/ and question/.
                        self._create_new_lesson(
                            module_idx=module_idx,
                            module_name=module_name,
                            module_focus=module_focus,
                            module_outcome=module_outcome,
                            module_duration=module_duration,
                            lesson_number=lesson_number,
                            lesson_name=lesson_name,
                            lesson_duration=lesson_duration,
                            lesson_outcome=lesson_outcome,
                            lesson_topics=lesson_topics,
                            course_title=course_title,
                            word_count_lower=lower_range,
                            word_count_upper=upper_range
                        )
                        content_stats["content_created"] += 1
                        content_stats["questions_created"] += 1
                    else:
                        print("    Content and questions already exist")
                        content_stats["content_updated"] += 1
                        content_stats["questions_updated"] += 1

            return {
                "status": "success",
                "message": "Content and questions created (audio will be created separately).",
                "hashid": self.input_hash,
                "stats": content_stats,
                "next_step": "Call /run-audiocreation to create audio files"
            }

        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Error parsing result: {e}")

    def _should_create_content(self, module_idx, lesson_number):
        """Check if content needs to be created.

        Returns False only when this is the first iteration AND both the
        content and question markdown files already exist; any later
        iteration always regenerates.
        """
        # NOTE(review): an identical definition of _should_create_content
        # appears again later in this class and shadows this one at class
        # creation time; this copy is dead code — candidate for removal.
        lesson_safe = lesson_number.replace('.', '_')
        
        # Check if both content and question files exist
        content_exists = os.path.exists(f"content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md")
        question_exists = os.path.exists(f"question/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md")
        
        # If it's first iteration or files don't exist, create them
        if self._is_first_iteration() and (not content_exists or not question_exists):
            return True
        elif not self._is_first_iteration():
            return True
        return False
    

    def run_content_creation(self):
        """Create markdown content and question files for every lesson in the
        parsed user journey (NO audio creation), cleaning each generated file.

        Returns:
            dict with status, per-lesson stats, and the next pipeline step,
            or a failure dict when the user journey has not been generated.

        Raises:
            HTTPException: 500 on any unexpected error.
        """
        try:
            if not os.path.exists(self.output_file):
                return {"status": "failure", "message": "User journey is not created."}

            # Early return above guarantees the journey file exists here;
            # the original re-checked os.path.exists redundantly.
            print("The user journey is created")
            parser = CourseOutlineParser(self.output_file)
            parsed_result = parser.parse_content()
            os.makedirs(os.path.dirname(self.json_output_path), exist_ok=True)
            parser.export_to_json(self.json_output_path, parsed_result)
            print(f"Course Title: {parsed_result['course_title']}\n")
            course_title = parsed_result['course_title']

            content_stats = {
                "total_lessons": 0,
                "content_created": 0,
                "content_updated": 0,
                "questions_created": 0,
                "questions_updated": 0,
                "word_count_removed": 0,
                "special_chars_cleaned": 0
            }

            # Iterate through modules
            for module_idx, module in enumerate(parsed_result['modules'], start=1):
                module_title = module['module_title']
                module_focus = module['focus']
                module_outcome = module['outcome']
                module_duration = module['duration']

                print(f"Processing Module {module_idx}: {module_title}")

                # Strip the "Module N:" prefix to get the bare module name
                module_name = re.sub(r'^Module \d+:\s*', '', module_title).strip()

                # Iterate through lessons in this module
                for lesson_idx, lesson in enumerate(module['lessons'], start=1):
                    lesson_title_full = lesson['lesson_title']
                    lesson_duration = lesson['duration']
                    lesson_outcome = lesson['outcome']
                    lesson_topics = lesson['topics']

                    # Prefer the explicit "Lesson X.Y" number from the title;
                    # fall back to positional numbering.
                    lesson_number_match = re.search(r'Lesson (\d+\.\d+)', lesson_title_full)
                    if lesson_number_match:
                        lesson_number = lesson_number_match.group(1)
                    else:
                        lesson_number = f"{module_idx}.{lesson_idx}"

                    # Strip the "Lesson X.Y:" prefix to get the bare name
                    lesson_name = re.sub(r'^Lesson \d+\.\d+:\s*', '', lesson_title_full).strip()

                    content_stats["total_lessons"] += 1

                    print(f"  Processing Lesson {lesson_number}: {lesson_name}")

                    # Target 120-130 words per minute of lesson duration;
                    # default to 20 minutes when no number is present.
                    duration_match = re.search(r'(\d+)', lesson_duration)
                    minutes = int(duration_match.group()) if duration_match else 20
                    lower_range = minutes * 120
                    upper_range = minutes * 130

                    # Paths of the files the crew is about to (re)generate
                    lesson_safe = lesson_number.replace('.', '_')
                    content_file = f"content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md"
                    question_file = f"question/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md"

                    if self._should_create_content(module_idx, lesson_number):
                        print("    Creating new content and questions...")
                        self._create_new_lesson(
                            module_idx=module_idx,
                            module_name=module_name,
                            module_focus=module_focus,
                            module_outcome=module_outcome,
                            module_duration=module_duration,
                            lesson_number=lesson_number,
                            lesson_name=lesson_name,
                            lesson_duration=lesson_duration,
                            lesson_outcome=lesson_outcome,
                            lesson_topics=lesson_topics,
                            course_title=course_title,
                            word_count_lower=lower_range,
                            word_count_upper=upper_range
                        )
                        content_stats["content_created"] += 1
                        content_stats["questions_created"] += 1

                        # Clean both generated files and fold results into stats.
                        # BUG FIX: the original tested `if cleaned:` on the
                        # returned dict, which is always truthy, so the stats
                        # counted every file whether or not it was cleaned.
                        for generated in (content_file, question_file):
                            if os.path.exists(generated):
                                cleaned = self._clean_content_file(generated)
                                if cleaned.get("cleaned"):
                                    content_stats["word_count_removed"] += 1
                                    if cleaned.get("special_chars_cleaned"):
                                        content_stats["special_chars_cleaned"] += 1
                    else:
                        print("    Content and questions already exist")
                        content_stats["content_updated"] += 1
                        content_stats["questions_updated"] += 1

            return {
                "status": "success",
                "message": "Content and questions created (audio will be created separately).",
                "hashid": self.input_hash,
                "stats": content_stats,
                "next_step": "Call /run-audiocreation to create audio files"
            }

        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Error parsing result: {e}")

    def _should_create_content(self, module_idx, lesson_number):
        """Decide whether a lesson's content/question files must be (re)built.

        Creation is skipped only when this is the first iteration AND both
        the content and the question markdown files already exist; any
        later iteration always regenerates.

        NOTE(review): an identical definition appears earlier in this class;
        this later one is the binding definition.
        """
        lesson_safe = lesson_number.replace('.', '_')

        # Check if both content and question files exist
        content_exists = os.path.exists(f"content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md")
        question_exists = os.path.exists(f"question/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md")

        # Evaluate the iteration flag once: the original called
        # _is_first_iteration() twice per decision, logging its debug
        # line twice for every lesson.
        if not self._is_first_iteration():
            return True
        return not (content_exists and question_exists)

    def _is_first_iteration(self):
        print(f"Inside feedback test : {self.input_data.feedback}")
        if self.input_data.feedback == "First iteration":
            return True
        return False

    def _clean_content_file(self, file_path):
        """Clean a markdown file in place.

        Strips word-count/validation lines, control characters, excess
        whitespace, stray markdown spacing, BOM, and trailing metadata
        ("Total", "words", ...), then rewrites the file only if anything
        actually changed.

        Args:
            file_path: path of the markdown file to clean.

        Returns:
            dict: {"cleaned": bool, "special_chars_cleaned": bool} on
            success; {"cleaned": False, "error": str} on failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            original_content = content
            special_chars_cleaned = False

            # 1. Remove word count / validation / feedback lines
            word_count_patterns = [
                r'\n*(?:Word\s*Count|word\s*count|WORD\s*COUNT)\s*:?\s*\d+\s*\n*',
                r'\n*CONTENT_(?:REJECTED|TOO_SHORT).*?\n*(?=\n|$)',
                r'\n*###?\s*(?:Validation|Word Count|Feedback).*?(?=\n###|\n##|\n#|\Z)',
                r'\n*(?:Reading time|Estimated reading).*?\n*',
                r'\n*(?:Retry|Attempt|Regeneration).*?\n*',
                r'\n\s*Total\s*$',
                r'\n\s*Total\s*\n*$',
            ]
            for pattern in word_count_patterns:
                content = re.sub(pattern, '\n', content, flags=re.IGNORECASE | re.DOTALL)

            # 2. Remove special/control characters: keep printable ASCII,
            # common typographic marks, and newline/tab/carriage-return;
            # everything else becomes a space.
            import string
            allowed_chars = set(string.printable)
            allowed_chars.update('—–•°´`‘’"“”')  # Common typographic marks
            kept_chars = []
            for char in content:
                if char == '\n' or char == '\t' or char == '\r':
                    kept_chars.append(char)
                elif char in allowed_chars:
                    kept_chars.append(char)
                else:
                    kept_chars.append(' ')
                    special_chars_cleaned = True
            content = ''.join(kept_chars)

            # 3. Collapse runs of spaces/tabs and excess blank lines
            content = re.sub(r'[ \t]+', ' ', content)
            content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)

            # 4. Fix markdown spacing around ** and #
            content = re.sub(r'\*\*\s+', '**', content)
            content = re.sub(r'\s+\*\*', '**', content)
            content = re.sub(r'#\s+', '# ', content)

            # 5. Remove trailing whitespace from every line
            content = '\n'.join(line.rstrip() for line in content.split('\n'))

            # 6. Remove BOM and trailing metadata.
            # BUG FIX: previously the result of _remove_trailing_metadata was
            # assigned to original_content (overwriting the pristine copy) and
            # then discarded, so the metadata was never removed from the saved
            # file AND unrelated cleaning changes never triggered a rewrite.
            content = content.replace('\ufeff', '')
            content = self._remove_trailing_metadata(content)

            if content != original_content:
                # Save cleaned content
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)

                print(f"    ✓ Cleaned: {os.path.basename(file_path)}")
                return {
                    "cleaned": True,
                    "special_chars_cleaned": special_chars_cleaned
                }

            return {"cleaned": False}

        except Exception as e:
            print(f"    ✗ Error cleaning {file_path}: {e}")
            return {"cleaned": False, "error": str(e)}

    def _create_new_lesson(self, module_idx, module_name, module_focus, module_outcome, module_duration,
                        lesson_number, lesson_name, lesson_duration, lesson_outcome, lesson_topics,
                        course_title, word_count_lower, word_count_upper):
        """Generate, review, and clean content + questions for one lesson.

        Pipeline: (1) run the second-stage crew to write the content file,
        (2) clean it, (3) have the content reviewer enrich the text,
        (4) run the QA crew to write the question file, (5) clean it, and
        (6) persist the reviewer-enriched text under updated_content/.

        Returns:
            A short status string; the real outputs are the files written
            under content/, question/ and updated_content/.
        """
    
        # Start from the raw request payload and layer lesson-specific keys on top.
        inputs = self.input_data.dict()
        
        # Extract minutes from lesson duration for word count calculation
        duration_minutes_match = re.search(r'(\d+)', lesson_duration)
        if duration_minutes_match:
            minutes = int(duration_minutes_match.group())
        else:
            minutes = 20  # Default
        
        # Prepare file paths (dots in the lesson number become underscores)
        lesson_safe = lesson_number.replace('.', '_')
        content_file = f"content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md"
        question_file = f"question/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md"
        
        # Prepare complete data structure
        complete_data = {
            "Course": course_title,
            "Module": {
                "number": module_idx,
                "name": module_name,
                "focus": module_focus,
                "outcome": module_outcome,
                "duration": module_duration
            },
            "Lesson": {
                "number": lesson_number,
                "title": lesson_name,
                "duration": lesson_duration,
                "objectives": lesson_outcome,
                "topics": lesson_topics
            }
        }
        
        print(f"The user journey for content creation is: {complete_data}")
        print(f"The word count for content creation is: {word_count_lower} to {word_count_upper}")
        print(f"The level and motive for content creation is: {self.input_data.Level} and {self.input_data.motive}")
        
        # Prepare inputs for the crew - ADD STRICT INSTRUCTIONS
        # NOTE(review): lesson_duration is passed as an int (minutes) here
        # while the other fields are strings — confirm the crew template
        # expects a number for this key.
        inputs.update({
            "course_title": course_title,
            "module_number": str(module_idx),
            "module_name": module_name,
            "module_focus": module_focus,
            "module_outcome": module_outcome,
            "lesson_number": lesson_number,
            "lesson_title": lesson_name,
            "lesson_duration": minutes,
            "lesson_objectives": lesson_outcome,
            "lesson_topics": lesson_topics,
            "user_journey": complete_data,
            "word_count": f"{word_count_lower} to {word_count_upper}",
            # ADD STRICT FORMATTING INSTRUCTIONS
            "strict_formatting": """
            ## CRITICAL FORMATTING RULES - MUST FOLLOW:
            
            1. **NO WORD COUNT REFERENCES:**
            - DO NOT include "Word Count:", "word count:", "Total words:" or any word count information
            - DO NOT mention word count validation in the content
            - DO NOT add word count numbers anywhere
            - DO NOT add standalone word "Total" at the end of content
            
            2. **NO VALIDATION COMMENTS:**
            - DO NOT include "CONTENT_REJECTED", "CONTENT_TOO_SHORT", or validation feedback
            - DO NOT mention validation or approval processes
            - Only output the actual lesson content
            
            3. **CLEAN SPECIAL CHARACTERS:**
            - Use standard ASCII characters only
            - Avoid Unicode special characters that may cause display issues
            - Use standard punctuation: , . ! ? : ; " '
            
            4. **PROPER MARKDOWN FORMATTING:**
            - Use consistent heading levels: # for main title, ## for sections, ### for subsections
            - Use proper line breaks and spacing
            - Ensure all markdown syntax is correctly formatted
            
            5. **OUTPUT FORMAT:**
            - Only output the lesson content in clean markdown
            - No extra comments, notes, or explanations
            - No word count information
            - No validation messages
            """
        })
        
        # Create content using crew (crew writes directly to content_file)
        print(f"Creating content file: {content_file}")
        crew = self.crew_instance.second_stage_crew(output_file=content_file)
        crew.kickoff(inputs=inputs)
        
        # Clean the content immediately after creation
        self._clean_content_file(content_file)
        
        # Read and review the created content
        with open(content_file, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Format topics for the reviewer
        topic_sections = ", and ".join([f"'{topic}'" for topic in lesson_topics])
        print(f"The topic sections are: {topic_sections}")
        
        # Review and enrich the content (module-level ContentReviewer instance)
        updated_data = content_reviewer.review_and_enrich_content(
            topic_sections, course_title, complete_data, text, f"{word_count_lower} to {word_count_upper}"
        )
        
        # Clean the reviewer output too
        updated_data = self._clean_text_content(updated_data)
        
        # Create questions using crew (QA crew writes directly to question_file)
        crew = self.crew_instance.qa_gen_crew(output_file=question_file)
        inputs["content"] = updated_data
        # Add strict instructions for question generation too
        inputs["question_instructions"] = """
        ## IMPORTANT FORMATTING RULES:
        - DO NOT include word count or validation comments
        - Use clean, standard characters only
        - Format questions clearly with ### Q1:, ### Q2:, etc.
        - No extra text or explanations
        """
        crew.kickoff(inputs=inputs)
        
        # Clean the question file
        self._clean_content_file(question_file)
        
        # Save updated content (this copy is what audio creation reads)
        updated_content_path = f'updated_content/{self.input_hash}/module{module_idx}_lesson{lesson_safe}.md'
        os.makedirs(os.path.dirname(updated_content_path), exist_ok=True)
        with open(updated_content_path, 'w', encoding='utf-8') as out_file:
            out_file.write(updated_data)
        
        # Clean the updated content file too
        self._clean_content_file(updated_content_path)
        
        return "Content and questions created and cleaned successfully."

    def _clean_text_content(self, text):
        """Clean generated text in memory.

        Strips word-count/validation markers, trailing "Total" metadata,
        special/control characters, excess whitespace, BOM, and any line
        consisting solely of "Total".

        Args:
            text: raw text from the content reviewer / crew output.

        Returns:
            The cleaned text, stripped of surrounding whitespace.  Falsy
            input is returned unchanged.
        """
        if not text:
            return text

        text = self._remove_trailing_metadata(text)

        # 1. Remove word count / validation patterns
        patterns_to_remove = [
            r'\n*(?:Word\s*Count|word\s*count|WORD\s*COUNT)\s*:?\s*\d+\s*\n*',
            r'\n*CONTENT_(?:REJECTED|TOO_SHORT).*?\n*(?=\n|$)',
            r'\n*###?\s*(?:Validation|Word Count|Feedback).*?(?=\n###|\n##|\n#|\Z)',
        ]
        for pattern in patterns_to_remove:
            text = re.sub(pattern, '\n', text, flags=re.IGNORECASE | re.DOTALL)

        # 2. Remove a standalone "Total" at the end of the content
        # (applied twice: the first sub can expose another trailing "Total")
        text = re.sub(r'\n\s*Total\s*$', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\n\s*Total\s*\n*$', '', text, flags=re.IGNORECASE)

        # 3. Replace special/control characters outside the allowed set
        # (printable ASCII + common typographic marks) with spaces.
        import string
        allowed = set(string.printable)
        allowed.update('—–•°´`‘’"“”')  # Common typographic marks
        cleaned_chars = []
        for char in text:
            if char == '\n' or char == '\t' or char == '\r':
                cleaned_chars.append(char)
            elif char in allowed:
                cleaned_chars.append(char)
            else:
                cleaned_chars.append(' ')
        text = ''.join(cleaned_chars)

        # 4. Collapse whitespace runs and excess blank lines
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

        # 5. Remove BOM
        text = text.replace('\ufeff', '')

        # 6. Drop blank lines and lines containing only "Total".
        # (The original's elif/continue branch was a no-op: non-appended
        # lines were already being discarded by falling through.)
        kept_lines = [
            line for line in text.split('\n')
            if line.strip() and line.strip().lower() != 'total'
        ]
        text = '\n'.join(kept_lines)

        return text.strip()
    


    def _remove_trailing_metadata(self, text):
        """Remove trailing metadata words and phrases from text - SIMPLER VERSION"""
        if not text:
            return text
        
        original_text = text
        
        # Pattern 1: Remove "Total" (case-insensitive) possibly followed by "words"
        # Handles: "Total", "Total words", "Total\nwords", "Total \n words"
        patterns = [
            # "Total" followed by optional whitespace and optional "words"
            r'(?:\n\s*)?[Tt][Oo][Tt][Aa][Ll]\s*(?:[Ww][Oo][Rr][Dd][Ss]?\s*)?$',
            # "Total" on its own line, followed by "words" on next line
            r'\n\s*[Tt][Oo][Tt][Aa][Ll]\s*\n\s*[Ww][Oo][Rr][Dd][Ss]?\s*$',
            # Any line ending with "Total" or "words"
            r'\n\s*[Tt][Oo][Tt][Aa][Ll]\s*$',
            r'\n\s*[Ww][Oo][Rr][Dd][Ss]?\s*$',
            # Number followed by "words" or "total"
            r'\n\s*\d+\s*(?:words?|total)\s*$',
        ]
        
        for pattern in patterns:
            text = re.sub(pattern, '', text)
        
        # Remove any lines that are just metadata words
        lines = text.split('\n')
        cleaned_lines = []
        
        metadata_words = {'total', 'words', 'word', 'count', 'summary', 'conclusion', 'end'}
        
        for line in lines:
            stripped = line.strip().lower()
            
            # Check if line is just a metadata word
            if stripped in metadata_words:
                continue
            
            # Check if line starts with metadata word followed by colon or space
            skip_line = False
            for word in metadata_words:
                if (stripped.startswith(word + ':') or 
                    stripped.startswith(word + ' ') or
                    stripped == word):
                    skip_line = True
                    break
            
            if not skip_line:
                cleaned_lines.append(line)
        
        text = '\n'.join(cleaned_lines)
        
        # Final cleanup: Remove any remaining "Total words" patterns
        text = re.sub(r'\n.*[Tt][Oo][Tt][Aa][Ll].*[Ww][Oo][Rr][Dd][Ss]?.*$', '', text)
        
        return text.strip()
