# Source code for genro_treestore.builders.html

# Copyright 2025 Softwell S.r.l. - Genropy Team
# SPDX-License-Identifier: Apache-2.0

"""HtmlBuilder - HTML5 element builder with content model validation.

This module provides builders for generating HTML5 documents with
structural validation based on the WHATWG HTML Living Standard.

Content Categories:
    HTML5 defines several content categories that determine where
    elements can appear and what they can contain:

    - **Metadata content**: Elements for document metadata (head section)
    - **Flow content**: Most elements that can appear in body
    - **Phrasing content**: Text-level semantics (inline elements)
    - **Heading content**: Section headings (h1-h6, hgroup)
    - **Sectioning content**: Document outline elements
    - **Embedded content**: External resources (img, video, etc.)
    - **Interactive content**: User interaction elements

References:
    - WHATWG HTML Standard: https://html.spec.whatwg.org/
    - Content categories: https://html.spec.whatwg.org/dev/dom.html

Example:
    Creating an HTML document::

        from genro_treestore import TreeStore
        from genro_treestore.builders import HtmlBuilder

        store = TreeStore(builder=HtmlBuilder())
        body = store.body()
        div = body.div(id='main', class_='container')
        div.h1(value='Welcome')
        div.p(value='Hello, World!')
        ul = div.ul()
        ul.li(value='Item 1')
        ul.li(value='Item 2')
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable

from .base import BuilderBase

if TYPE_CHECKING:
    from ..store import TreeStore
    from ..store import TreeStoreNode


# =============================================================================
# HTML5 Content Categories
# Based on WHATWG HTML Standard: https://html.spec.whatwg.org/dev/dom.html
# =============================================================================

# Void elements - written as a single self-closing tag; they take neither
# children nor text content.
VOID_ELEMENTS = frozenset(
    "area base br col embed hr img input link meta source track wbr".split()
)

# Metadata content - the elements that describe the document itself and
# normally live inside <head>.
METADATA_CONTENT = frozenset(
    ("base", "link", "meta", "noscript", "script", "style", "template", "title")
)

# Flow content - the bulk of what may appear inside <body>.
# Largest category: it mixes block-level and inline elements.
# Tags are listed alphabetically, one initial letter per row.
FLOW_CONTENT = frozenset(
    """
    a abbr address article aside audio
    b bdi bdo blockquote br button
    canvas cite code
    data datalist del details dfn dialog div dl
    em embed
    fieldset figure footer form
    h1 h2 h3 h4 h5 h6 header hgroup hr
    i iframe img input ins
    kbd
    label
    main map mark math menu meter
    nav noscript
    object ol output
    p picture pre progress
    q
    ruby
    s samp script search section select slot small span strong sub sup svg
    table template textarea time
    u ul
    var video
    wbr
    """.split()
)

# Phrasing content - text-level semantics (roughly the inline elements).
# Tags are listed alphabetically, one initial letter per row.
PHRASING_CONTENT = frozenset(
    """
    a abbr audio
    b bdi bdo br button
    canvas cite code
    data datalist del dfn
    em embed
    i iframe img input ins
    kbd
    label
    map mark math meter
    noscript
    object output
    picture progress
    q
    ruby
    s samp script select slot small span strong sub sup svg
    template textarea time
    u
    var video
    wbr
    """.split()
)

# Heading content - the six heading levels plus their grouping element.
HEADING_CONTENT = frozenset([*(f"h{level}" for level in range(1, 7)), "hgroup"])

# Sectioning content - the elements that shape the document outline.
SECTIONING_CONTENT = frozenset(("article", "aside", "nav", "section"))

# Embedded content - elements that pull an external resource into the page.
EMBEDDED_CONTENT = frozenset(
    "audio canvas embed iframe img math object picture svg video".split()
)

# Interactive content - elements the user can operate directly.
INTERACTIVE_CONTENT = frozenset(
    "a audio button details embed iframe img input label select textarea video".split()
)

# =============================================================================
# Element-specific child constraints
# Maps parent elements to the tags allowed as their direct children.
# Every value is a frozenset, like the category constants above, so these
# shared tables cannot be mutated by accident.
# =============================================================================

# Script-supporting elements: the HTML Standard lets these be intermixed with
# the children of otherwise restricted containers (lists, tables, select, ...).
_SCRIPT_SUPPORTING = frozenset({"script", "template"})

ELEMENT_CHILDREN = {
    # Document structure
    "html": frozenset({"head", "body"}),
    "head": METADATA_CONTENT,
    "body": FLOW_CONTENT,
    # Lists
    "ul": _SCRIPT_SUPPORTING | {"li"},
    "ol": _SCRIPT_SUPPORTING | {"li"},
    "dl": _SCRIPT_SUPPORTING | {"dt", "dd", "div"},
    "menu": _SCRIPT_SUPPORTING | {"li"},
    # Tables
    "table": _SCRIPT_SUPPORTING
    | {"caption", "colgroup", "thead", "tbody", "tfoot", "tr"},
    "thead": _SCRIPT_SUPPORTING | {"tr"},
    "tbody": _SCRIPT_SUPPORTING | {"tr"},
    "tfoot": _SCRIPT_SUPPORTING | {"tr"},
    "tr": _SCRIPT_SUPPORTING | {"th", "td"},
    # colgroup accepts <template> but not <script>
    "colgroup": frozenset({"col", "template"}),
    # Forms
    "select": _SCRIPT_SUPPORTING | {"option", "optgroup", "hr"},
    "optgroup": _SCRIPT_SUPPORTING | {"option"},
    # datalist holds either <option> entries or phrasing fallback content
    "datalist": PHRASING_CONTENT | _SCRIPT_SUPPORTING | {"option"},
    "fieldset": FLOW_CONTENT | {"legend"},
    # Grouping with special first child
    "figure": FLOW_CONTENT | {"figcaption"},
    "details": FLOW_CONTENT | {"summary"},
    # Media
    "picture": _SCRIPT_SUPPORTING | {"source", "img"},
    "audio": FLOW_CONTENT | {"source", "track"},
    "video": FLOW_CONTENT | {"source", "track"},
    # Other
    "map": FLOW_CONTENT | {"area"},
    "ruby": PHRASING_CONTENT | {"rt", "rp"},
}

# Every tag the builder knows: the metadata, flow and void categories merged
# with the structural elements that only occur inside a specific parent.
ALL_TAGS = frozenset().union(
    METADATA_CONTENT,
    FLOW_CONTENT,
    VOID_ELEMENTS,
    """
    html head body
    li dt dd
    caption colgroup col thead tbody tfoot tr th td
    option optgroup legend
    figcaption summary
    source track area
    rt rp
    """.split(),
)


class HtmlBuilder(BuilderBase):
    """Builder for HTML elements.

    Provides dynamic methods for all HTML tags via ``__getattr__``.
    Void elements (meta, br, img, etc.) automatically use empty string value.

    A single trailing underscore in the requested name is ignored, so a tag
    whose name is a Python keyword stays reachable with attribute syntax:
    ``del_`` resolves to the ``del`` tag.

    Usage:
        >>> store = TreeStore(builder=HtmlBuilder())
        >>> store.div(id='main').p(value='Hello')
        >>> store.ul().li(value='Item 1')

    Categories available as class attributes for reference:
        - VOID_ELEMENTS
        - FLOW_CONTENT
        - PHRASING_CONTENT
        - etc.
    """

    # Expose categories as class attributes
    VOID_ELEMENTS = VOID_ELEMENTS
    METADATA_CONTENT = METADATA_CONTENT
    FLOW_CONTENT = FLOW_CONTENT
    PHRASING_CONTENT = PHRASING_CONTENT
    HEADING_CONTENT = HEADING_CONTENT
    SECTIONING_CONTENT = SECTIONING_CONTENT
    EMBEDDED_CONTENT = EMBEDDED_CONTENT
    INTERACTIVE_CONTENT = INTERACTIVE_CONTENT
    ELEMENT_CHILDREN = ELEMENT_CHILDREN
    ALL_TAGS = ALL_TAGS

    def __getattr__(self, name: str) -> Callable[..., TreeStore | TreeStoreNode]:
        """Dynamic method for any HTML tag.

        Args:
            name: Tag name (e.g., 'div', 'span', 'meta'). One trailing
                underscore is stripped before the lookup ('del_' -> 'del').

        Returns:
            Callable that creates a child with that tag.

        Raises:
            AttributeError: If name is private (leading underscore) or does
                not resolve to a valid HTML tag.
        """
        # Never synthesize private/dunder attributes: protocols such as copy
        # and pickle probe for them and must see a plain AttributeError.
        if name.startswith("_"):
            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")

        # `store.del(...)` is a syntax error, so accept the conventional
        # trailing-underscore spelling for keyword-like tag names.
        tag = name[:-1] if name.endswith("_") else name
        if tag in ALL_TAGS:
            return self._make_tag_method(tag)

        raise AttributeError(f"'{name}' is not a valid HTML tag")

    def _make_tag_method(self, name: str) -> Callable[..., TreeStore | TreeStoreNode]:
        """Create a method for a specific tag.

        Args:
            name: Tag name the returned callable creates by default.

        Returns:
            A function ``tag_method(target, tag=name, value=None, **attr)``
            that forwards to ``self.child``.
        """
        is_void = name in VOID_ELEMENTS

        def tag_method(
            target: TreeStore, tag: str = name, value: Any = None, **attr: Any
        ) -> TreeStore | TreeStoreNode:
            # NOTE(review): `tag` is the second positional parameter, so a
            # second positional argument replaces the tag instead of setting
            # `value` - confirm this is intended.
            # Void elements get empty string value (self-closing)
            if is_void and value is None:
                value = ""
            return self.child(target, tag, value=value, **attr)

        return tag_method
class HtmlHeadBuilder(HtmlBuilder):
    """Builder for HTML head section.

    Allows all HTML tags but is semantically intended for head content
    (meta, title, link, style, script, etc.).
    """


class HtmlBodyBuilder(HtmlBuilder):
    """Builder for HTML body section.

    Allows all HTML tags for body content generation.
    """


class HtmlPage:
    """HTML page with separate head and body TreeStores.

    Creates a complete HTML document structure with:
        - html root TreeStore
        - head TreeStore with HtmlHeadBuilder (intended for metadata)
        - body TreeStore with HtmlBodyBuilder (intended for flow content)

    Usage:
        >>> page = HtmlPage()
        >>> page.head.title(value='My Page')
        >>> page.head.meta(charset='utf-8')
        >>> page.body.div(id='main').p(value='Hello World')
        >>> html = page.to_html()
    """

    def __init__(self):
        """Initialize the page with head and body."""
        from ..store import TreeStore

        self.html = TreeStore()
        self.head = TreeStore(builder=HtmlHeadBuilder())
        self.body = TreeStore(builder=HtmlBodyBuilder())
        self.html.set_item("head", self.head)
        self.html.set_item("body", self.body)

    def _attrs_to_html(self, node: TreeStoreNode) -> str:
        """Render a node's public attributes as `` key="value"`` pairs.

        Attributes whose name starts with an underscore are skipped. Values
        are HTML-escaped (quotes included) so that a value containing ``"``,
        ``&`` or ``<`` cannot break out of the attribute.

        Returns:
            The attribute string with a leading space, or "" when there is
            nothing to render.
        """
        from html import escape

        rendered = " ".join(
            f'{key}="{escape(str(value), quote=True)}"'
            for key, value in node.attr.items()
            if not key.startswith("_")
        )
        return f" {rendered}" if rendered else ""

    def _node_to_html(self, node: TreeStoreNode, indent: int = 0) -> str:
        """Recursively convert a node to HTML.

        Args:
            node: Node to serialize; its tag falls back to the node label.
            indent: Nesting depth, one space per level.

        Returns:
            The HTML for the node and its descendants, one element per line.
        """
        tag = node.tag or node.label
        opening = f"{' ' * indent}<{tag}{self._attrs_to_html(node)}>"

        if node.is_leaf:
            value = node.value
            if value is None or value == "":
                # Only void elements may omit the end tag; an empty <div> or
                # <script> left unclosed would swallow the following markup.
                if tag in VOID_ELEMENTS:
                    return opening
                return f"{opening}</{tag}>"
            # NOTE: text content is emitted verbatim (inline markup and
            # script/style bodies pass through unchanged); callers must
            # escape untrusted text themselves.
            return f"{opening}{value}</{tag}>"

        lines = [opening]
        lines.extend(self._node_to_html(child, indent + 1) for child in node.value.nodes())
        lines.append(f"{' ' * indent}</{tag}>")
        return "\n".join(lines)

    def _store_to_html(self, store: TreeStore, tag: str, indent: int = 0) -> str:
        """Convert a TreeStore to HTML with a wrapper tag.

        Args:
            store: Store whose nodes become the children of the wrapper.
            tag: Name of the wrapper element (e.g. "head" or "body").
            indent: Nesting depth of the wrapper, one space per level.
        """
        spaces = " " * indent
        lines = [f"{spaces}<{tag}>"]
        lines.extend(self._node_to_html(node, indent + 1) for node in store.nodes())
        lines.append(f"{spaces}</{tag}>")
        return "\n".join(lines)

    def to_html(self, filename: str | None = None, output_dir: str | None = None) -> str:
        """Generate complete HTML.

        Args:
            filename: If provided, save to output_dir/filename
            output_dir: Directory to save to (default: current directory).
                Missing directories, including parents, are created.

        Returns:
            HTML string, or path if filename was provided
        """
        from pathlib import Path

        html_content = "\n".join(
            [
                "<!DOCTYPE html>",
                "<html>",
                self._store_to_html(self.head, "head", indent=0),
                self._store_to_html(self.body, "body", indent=0),
                "</html>",
            ]
        )
        if not filename:
            return html_content

        if output_dir is None:
            target_dir = Path.cwd()
        else:
            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)

        output_path = target_dir / filename
        # Write UTF-8 explicitly: the platform default encoding may be unable
        # to encode the page or may disagree with a declared charset.
        output_path.write_text(html_content, encoding="utf-8")
        return str(output_path)

    def print_tree(self):
        """Print the tree structure for debugging.

        The head section shows tags with values truncated to 30 characters;
        the body section shows tags, public attributes and full values.
        """
        print("=" * 60)
        print("HEAD")
        print("=" * 60)
        for path, node in self.head.walk():
            indent_level = " " * path.count(".")
            tag = node.tag or node.label
            value_str = ""
            if node.is_leaf and node.value:
                val = str(node.value)
                value_str = f': "{val[:30]}..."' if len(val) > 30 else f': "{val}"'
            print(f"{indent_level}<{tag}>{value_str}")

        print("\n" + "=" * 60)
        print("BODY")
        print("=" * 60)
        for path, node in self.body.walk():
            indent_level = " " * path.count(".")
            tag = node.tag or node.label
            value_str = f': "{node.value}"' if node.is_leaf and node.value else ""
            attrs = " ".join(f'{k}="{v}"' for k, v in node.attr.items() if not k.startswith("_"))
            attrs_str = f" [{attrs}]" if attrs else ""
            print(f"{indent_level}<{tag}{attrs_str}>{value_str}")