Source code for looker_powerpoint.tools.find_alt_text

from pptx import Presentation
from lxml import etree
import yaml

# Namespace prefix map passed to XPath queries over PresentationML elements.
NS = {"p": "http://schemas.openxmlformats.org/presentationml/2006/main"}

# Characters that are rewritten to a straight ASCII double quote (").
_DOUBLE_QUOTE_VARIANTS = (
    "\u201c"  # LEFT DOUBLE QUOTATION MARK
    "\u201d"  # RIGHT DOUBLE QUOTATION MARK
    "\u201e"  # DOUBLE LOW-9 QUOTATION MARK
    "\u2033"  # DOUBLE PRIME
    "\u00ab"  # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    "\u00bb"  # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
)

# Characters that are rewritten to a straight ASCII single quote (').
_SINGLE_QUOTE_VARIANTS = (
    "\u2018"  # LEFT SINGLE QUOTATION MARK
    "\u2019"  # RIGHT SINGLE QUOTATION MARK
    "\u201a"  # SINGLE LOW-9 QUOTATION MARK
    "\u2032"  # PRIME
    "\u0060"  # GRAVE ACCENT
    "\u00b4"  # ACUTE ACCENT
)

# Mapping of each "smart" / look-alike quote character to its ASCII
# replacement: double-quote variants first, then single-quote variants.
_QUOTE_REPLACEMENTS = {
    **dict.fromkeys(_DOUBLE_QUOTE_VARIANTS, '"'),
    **dict.fromkeys(_SINGLE_QUOTE_VARIANTS, "'"),
}

# Translation table built once at import time so every call to
# ``str.translate`` replaces all variants in a single pass.
_QUOTE_TABLE = str.maketrans(_QUOTE_REPLACEMENTS)



[docs]
def cleanse_alt_text(text: str) -> str:
    """Replace typographic quote characters with straight ASCII quotes.

    Editors that auto-substitute curly quotes can turn otherwise valid YAML
    into text a YAML parser rejects or misreads. This function maps every
    character listed in the module-level quote table to a plain ``"`` or
    ``'`` so the result can be handed to the YAML parser.

    Args:
        text: Raw alternative-text string taken from a shape.

    Returns:
        A copy of ``text`` in which each known quote variant has been
        replaced by its ASCII equivalent; all other characters are unchanged.
    """
    normalised = text.translate(_QUOTE_TABLE)
    return normalised




[docs]
def extract_alt_text(shape):
    """
    Read a shape's alternative text and parse it as YAML.

    The shape's XML is re-parsed with lxml and searched for the non-visual
    properties element (``p:cNvPr``) of an auto shape, a picture, or a
    graphic frame, in that order. The first non-empty ``descr`` attribute
    found is quote-normalised and loaded with ``yaml.safe_load``.

    Args:
        shape: A Shape object from pptx.

    Returns:
        Whatever ``yaml.safe_load`` produces for the alternative text, or
        None when none of the searched elements carries a non-empty
        ``descr`` attribute.
    """
    root = etree.fromstring(shape.element.xml)

    candidate_paths = (
        ".//p:nvSpPr/p:cNvPr",
        ".//p:nvPicPr/p:cNvPr",
        ".//p:nvGraphicFramePr/p:cNvPr",
    )
    for xpath_expr in candidate_paths:
        matches = root.xpath(xpath_expr, namespaces=NS)
        if not matches:
            continue

        # Only the first matching element is inspected for each path.
        alt_text = matches[0].get("descr")
        if not alt_text:
            continue

        # safe_load is used because the text comes from an untrusted source.
        return yaml.safe_load(cleanse_alt_text(alt_text))

    return None




[docs]
def get_presentation_objects_with_descriptions(pptx_path):
    """
    Collect the shapes of a presentation whose alternative text holds data.

    Every shape on every slide is passed to ``extract_alt_text``. Shapes for
    which that returns a falsy value (no alternative text, empty text, or
    text that parses to an empty/falsy YAML value) are skipped; size
    information is only read for the shapes that are kept.

    Args:
        pptx_path (str): The path to the PowerPoint presentation file.

    Returns:
        list: One dictionary per kept shape, with the keys:

            - ``shape_id``: the ``meta_name`` entry of the parsed alternative
              text when it is a mapping containing that key, otherwise the
              string ``"<slide index>,<shape id>"``.
            - ``shape_type``: name of the shape's ``shape_type`` member.
            - ``shape_width`` / ``shape_height``: size in pixels, rounded to
              the nearest integer.
            - ``integration``: the parsed alternative text.
            - ``slide_number``: zero-based index of the slide.
            - ``shape_number``: the shape's ``shape_id``.

        An empty list is returned if the presentation cannot be opened (the
        error is printed) or if no shape qualifies.
    """
    # 914400 EMU per inch / 96 pixels per inch = 9525 EMU per pixel.
    emu_per_pixel = 9525

    try:
        presentation = Presentation(pptx_path)
    except Exception as e:
        # Best-effort boundary: report the problem and return "nothing found".
        print(f"Error opening presentation: {e}")
        return []

    objects_with_descriptions = []

    for slide_index, slide in enumerate(presentation.slides):
        for shape in slide.shapes:
            description = extract_alt_text(shape)
            if not description:
                # Skip before touching width/height so a shape that carries
                # no usable alternative text can never fail on its size.
                continue

            if isinstance(description, dict) and "meta_name" in description:
                # An explicit name in the alternative text wins.
                shape_id = description["meta_name"]
            else:
                # Fall back to a "slide index,shape id" composite identifier.
                shape_id = f"{slide_index},{shape.shape_id}"

            objects_with_descriptions.append(
                {
                    "shape_id": shape_id,
                    "shape_type": shape.shape_type.name,
                    "shape_width": round(shape.width / emu_per_pixel),
                    "shape_height": round(shape.height / emu_per_pixel),
                    "integration": description,
                    "slide_number": slide_index,
                    "shape_number": shape.shape_id,
                }
            )

    return objects_with_descriptions



if __name__ == "__main__":
    # Example usage: list the annotated shapes found in a presentation.
    pptx_file = "p.pptx"  # Replace with your file path
    objects = get_presentation_objects_with_descriptions(pptx_file)

    # Imported after the call above so that the function's own error
    # message is still emitted by the builtin print.
    from rich import print

    # Show the collected shapes; previously the result was computed and
    # then discarded without any output.
    print(objects)
Source code for looker_powerpoint.tools.find_alt_text

Looker PowerPoint CLI

Navigation

Related Topics