import markdown
import json
import re
import logging
from typing import List, Dict, Any, Optional

# Configure the root logger as a side effect of importing this module:
# records at INFO level and above are emitted, each formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class MarkdownParser:
    """
    Parse markdown text and extract structured content from it.

    The markdown is rendered to HTML with Python-Markdown, and headings, code
    blocks, tables, links and lists are then pulled out of that HTML with
    regular expressions. All extracted fragments are returned as they appear
    in the rendered HTML (inline markup and HTML entities are left untouched).
    """

    # Patterns are compiled once at class level. Opening tags accept arbitrary
    # attributes because the 'attr_list' extension enabled in __init__ can add
    # ids/classes to any element, and DOTALL lets content span several lines.
    _HEADING_RE = re.compile(r'<h([1-6])\b[^>]*>(.*?)</h\1>', re.DOTALL)
    _CODE_BLOCK_RE = re.compile(r'<pre><code[^>]*>(.*?)</code></pre>', re.DOTALL)
    _TABLE_RE = re.compile(r'<table\b[^>]*>(.*?)</table>', re.DOTALL)
    _LINK_RE = re.compile(r'<a\b[^>]*?\shref="([^"]*)"[^>]*>(.*?)</a>', re.DOTALL)
    # Matches both opening and closing list tags; group 1 is '/' for a closer.
    _LIST_TAG_RE = re.compile(r'<(/?)(?:ul|ol)\b[^>]*>')

    def __init__(self):
        """
        Initializes the MarkdownParser with necessary extensions.
        """
        self.md = markdown.Markdown(extensions=['fenced_code', 'tables', 'attr_list', 'mdx_math', 'smarty'])

    def parse_markdown(self, markdown_text: str) -> Dict[str, Any]:
        """
        Parses a markdown string and extracts structured data.

        Args:
            markdown_text (str): The markdown text to parse.

        Returns:
            Dict[str, Any]: A dictionary with the keys 'headings',
            'code_blocks', 'tables', 'links' and 'lists'. An empty dictionary
            is returned (and the error is logged) if parsing fails.
        """
        try:
            # The Markdown instance is reused for every document, so clear any
            # state left behind by the previous conversion first.
            self.md.reset()
            html = self.md.convert(markdown_text)
            return self._extract_structured_data(html)
        except Exception as e:
            logging.error("Error parsing markdown: %s", e)
            return {}

    def _extract_structured_data(self, html: str) -> Dict[str, Any]:
        """
        Extracts headings, code blocks, tables, links, and lists from the HTML.

        Args:
            html (str): The HTML string generated from the markdown.

        Returns:
            Dict[str, Any]: A dictionary containing the extracted structured data.
        """
        return {
            'headings': self._extract_headings(html),
            'code_blocks': self._extract_code_blocks(html),
            'tables': self._extract_tables(html),
            'links': self._extract_links(html),
            'lists': self._extract_lists(html),
        }

    def _extract_headings(self, html: str) -> List[Dict[str, Any]]:
        """
        Extracts headings (h1-h6) from the HTML.

        Headings carrying attributes (for example an id added through the
        attr_list extension) and headings spanning several lines are included.

        Args:
            html (str): The HTML string.

        Returns:
            List[Dict[str, Any]]: One dictionary per heading with the integer
            'level' (1-6) and the stripped inner HTML as 'text'. Entries are
            grouped by level (all h1 first, then h2, ...), and keep document
            order within each level.
        """
        headings = [
            {'level': int(match.group(1)), 'text': match.group(2).strip()}
            for match in self._HEADING_RE.finditer(html)
        ]
        # list.sort is stable, so document order is preserved within a level.
        headings.sort(key=lambda heading: heading['level'])
        return headings

    def _extract_code_blocks(self, html: str) -> List[str]:
        """
        Extracts code blocks from the HTML.

        Args:
            html (str): The HTML string.

        Returns:
            List[str]: The contents of every <pre><code> element, still
            HTML-escaped as rendered. Leading blank lines and trailing
            whitespace are removed; the indentation of the first code line is
            kept because it can be significant.
        """
        return [
            block.lstrip('\r\n').rstrip()
            for block in self._CODE_BLOCK_RE.findall(html)
        ]

    def _extract_tables(self, html: str) -> List[str]:
        """
        Extracts tables from the HTML.

        Args:
            html (str): The HTML string.

        Returns:
            List[str]: The inner HTML of each <table> element, stripped of
            surrounding whitespace.
        """
        return [table.strip() for table in self._TABLE_RE.findall(html)]

    def _extract_links(self, html: str) -> List[Dict[str, str]]:
        """
        Extracts links from the HTML.

        Only <a> elements with a double-quoted href attribute are reported;
        the link text may span several lines.

        Args:
            html (str): The HTML string.

        Returns:
            List[Dict[str, str]]: A list of dictionaries, each containing the
            link 'text' (inner HTML) and 'url', both stripped.
        """
        return [
            {'text': text.strip(), 'url': url.strip()}
            for url, text in self._LINK_RE.findall(html)
        ]

    def _extract_lists(self, html: str) -> List[str]:
        """
        Extracts lists (both ordered and unordered) from the HTML.

        Opening and closing list tags are counted so that a list containing
        nested lists is returned whole instead of being cut off at the first
        inner closing tag. Nested lists are not reported separately; they are
        part of their outermost list's HTML.

        Args:
            html (str): The HTML string.

        Returns:
            List[str]: The stripped inner HTML of each outermost list.
        """
        lists: List[str] = []
        depth = 0
        content_start = 0
        for tag in self._LIST_TAG_RE.finditer(html):
            is_closing = tag.group(1) == '/'
            if not is_closing:
                if depth == 0:
                    content_start = tag.end()
                depth += 1
            elif depth > 0:  # ignore stray closing tags
                depth -= 1
                if depth == 0:
                    lists.append(html[content_start:tag.start()].strip())
        return lists

    def markdown_file_to_json(self, file_path: str) -> Optional[str]:
        """
        Reads a markdown file and converts it to a JSON string.

        Args:
            file_path (str): The path to the markdown file (read as UTF-8).

        Returns:
            Optional[str]: A JSON string (indented by 4 spaces) representing
            the structured data, or None if the file could not be read or
            serialized. A markdown parsing failure is handled inside
            parse_markdown, which yields an empty dictionary, so in that case
            the result is the JSON text "{}" rather than None.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
            structured_data = self.parse_markdown(markdown_text)
            return json.dumps(structured_data, indent=4)
        except FileNotFoundError:
            logging.error("File not found: %s", file_path)
            return None
        except Exception as e:
            logging.error("Error converting markdown file to JSON: %s", e)
            return None


if __name__ == '__main__':
    # Example usage: write a small sample document to disk, convert it to
    # JSON and print the result.
    import os
    import tempfile

    parser = MarkdownParser()

    # Sample document exercising every extractor: headings, a link, a fenced
    # code block, a table, an unordered list and an ordered list.
    markdown_content = """
# My Document

## Section 1

This is a paragraph with a [link](https://www.example.com).

```python
def hello_world():
    print("Hello, world!")
```

| Header 1 | Header 2 |
| -------- | -------- |
| Cell 1   | Cell 2   |

- Item 1
- Item 2

1. Ordered Item 1
2. Ordered Item 2
"""

    # Use a temporary directory so the demo neither overwrites an existing
    # test.md in the working directory nor leaves a stray file behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        sample_path = os.path.join(tmp_dir, "test.md")
        with open(sample_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)

        json_output = parser.markdown_file_to_json(sample_path)

    if json_output is not None:
        print(json_output)
    else:
        print("Failed to parse markdown file.")
