import re
import json

import re

class CourseOutlineParser:
    """Parse a Markdown course outline into a plain dictionary.

    The layout the parser looks for (bold ``**`` markers around heading
    text are optional everywhere)::

        # Course title
        ## Stage 1: Stage name
        ### Focus
        free text
        ### Outcome
        free text
        ### Duration
        free text
        ### Topics Covered
        - first topic
        - second topic

    The file is read once, when the parser is constructed; the raw text is
    kept in ``self.content``.
    """

    # Course title: a level-1 heading only.  ``(?!#)`` stops ``##``/``###``
    # lines from being reported as the title when the file has no ``#`` line,
    # and ``[ \t]*`` (rather than ``\s*``) keeps the match on a single line.
    _MAIN_HEADING_RE = re.compile(
        r'^#(?!#)[ \t]*\**(.*?)\**[ \t]*$', re.MULTILINE
    )

    # Stage heading, anchored to the start of a line so that the slice of the
    # previous stage ends cleanly before the heading's first ``#``.  Two or
    # more ``#`` are accepted, and ``$`` (instead of a mandatory newline) lets
    # a heading on the very last line of the file be recognised.
    _STAGE_HEADING_RE = re.compile(
        r'^[ \t]*#{2,}[ \t]*\**(Stage \d+:?.*?)\**[ \t]*$', re.MULTILINE
    )

    # What ends a section body: the next ``###`` heading, any ``#``/``##``
    # heading, or a horizontal rule (``---``, ``***``, ``___``, optionally
    # spaced) standing alone on a line.
    _SECTION_END = (
        r'\n(?:'
        r'###'
        r'|#{1,2}[ \t]'
        r'|[ \t]*(?:(?:-[ \t]*){3,}|(?:\*[ \t]*){3,}|(?:_[ \t]*){3,})(?=\n|\Z)'
        r')'
    )

    # A bullet must be the first non-blank thing on its line; a hyphen in the
    # middle of a sentence is not a list item.
    _BULLET_RE = re.compile(r'^[ \t]*[-*+][ \t]+(.*)$', re.MULTILINE)

    def __init__(self, file_path):
        """Remember ``file_path`` and load its text into ``self.content``.

        Raises whatever ``open`` raises (e.g. ``FileNotFoundError``) when the
        file cannot be read.
        """
        self.file_path = file_path
        self.content = self._read_markdown()

    def _read_markdown(self):
        """Return the full text of ``self.file_path``, decoded as UTF-8."""
        with open(self.file_path, 'r', encoding='utf-8') as f:
            return f.read()

    def parse_content(self):
        """Extract the course title and the per-stage details.

        Returns:
            dict: ``{'main_heading': str, 'stages': [stage, ...]}``.
            ``'main_heading'`` is present only when the text contains a
            level-1 heading.  Each ``stage`` is a dict with the keys
            ``stage_title``, ``focus``, ``outcome``, ``duration`` (each a
            string, or ``None`` when that section is missing) and
            ``topics_covered`` (a list of strings, possibly empty).
        """
        result = {}

        title = self._MAIN_HEADING_RE.search(self.content)
        if title:
            result['main_heading'] = title.group(1).strip()

        headings = list(self._STAGE_HEADING_RE.finditer(self.content))
        # A stage's body runs from the end of its heading line up to the
        # start of the next stage heading, or to the end of the text.
        body_ends = [m.start() for m in headings[1:]] + [len(self.content)]

        result['stages'] = [
            self._parse_stage(
                heading.group(1).strip(),
                self.content[heading.end():body_end],
            )
            for heading, body_end in zip(headings, body_ends)
        ]
        return result

    def _parse_stage(self, stage_title, stage_content):
        """Build the dictionary describing one stage from its body text."""
        return {
            'stage_title': stage_title,
            'focus': self._extract_section(stage_content, 'Focus'),
            'outcome': self._extract_section(stage_content, 'Outcome'),
            'duration': self._extract_section(stage_content, 'Duration'),
            'topics_covered': self._extract_topics(stage_content),
        }

    def _extract_section(self, content, section_name):
        """Return the text that follows the ``### <section_name>`` heading.

        The heading text may be wrapped in ``**`` and may be followed by a
        colon; neither ends up in the result.  The text may start on the
        heading line itself (``### Duration: 2 weeks``).  It ends at the next
        heading, at a horizontal rule, or at the end of ``content``.

        Args:
            content: Markdown text of a single stage.
            section_name: Heading text to look for; taken literally.

        Returns:
            The section text with surrounding whitespace removed (may be an
            empty string), or ``None`` when no such heading exists.
        """
        pattern = (
            # re.escape: names such as "C++" must not be read as regex syntax.
            r'###\s*\**' + re.escape(section_name) + r'\**:?\**\s*'
            r'(.*?)\s*(?=' + self._SECTION_END + r'|\Z)'
        )
        match = re.search(pattern, content, re.DOTALL)
        if match is None:
            return None
        return match.group(1).strip()

    def _extract_topics(self, content):
        """Return the bullet items listed under ``### Topics Covered``.

        Lines whose first non-blank character is ``-``, ``*`` or ``+``
        followed by whitespace count as bullets; everything else in the
        section is ignored.

        Args:
            content: Markdown text of a single stage.

        Returns:
            list: The bullet texts, stripped, in document order.  Empty when
            the section is missing or contains no bullets.
        """
        block = self._extract_section(content, 'Topics Covered')
        if not block:
            return []
        items = (item.strip() for item in self._BULLET_RE.findall(block))
        return [item for item in items if item]

    def export_to_json(self, output_path, parsed_data):
        """Write ``parsed_data`` to ``output_path`` as JSON (4-space indent).

        The file is created or overwritten and encoded as UTF-8.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(parsed_data, f, indent=4)


