to guess a country name and maps it to an ISO alpha-2 code using the provided JSON. """ if not iso_path.exists(): return svg try: with iso_path.open(encoding="utf-8") as fh: mapping = json.load(fh) except Exception: return svg norm_map = { normalize_name(v): k.upper() for k, v in mapping.items() if v } # Prefer an XML-aware edit: parse and modify elements, then serialize. try: root = ET.fromstring(svg) # detect and register default namespace if present so ET doesn't # emit ns0 prefixes when serializing ns_uri = None if isinstance(root.tag, str) and root.tag.startswith('{'): ns_uri = root.tag.split('}')[0].strip('{') elif 'xmlns' in root.attrib: ns_uri = root.attrib.get('xmlns') if ns_uri: ET.register_namespace('', ns_uri) # iterate over all elements and handle local tag names (ignore namespace) for elem in root.iter(): tag = elem.tag local = tag.split('}', 1)[1] if '}' in tag else tag if local not in ('path', 'g'): continue if 'data-iso' in elem.attrib: continue # candidate sources cand = elem.get('id') or elem.get('name') or elem.get('data-name') if not cand: title = None for child in elem: ctag = child.tag c_local = ctag.split('}', 1)[1] if '}' in ctag else ctag if c_local == 'title' and child.text: title = child.text break cand = title if not cand: continue code = norm_map.get(normalize_name(cand)) if code: elem.set('data-iso', code) # serialize back to a string; namespace registration prevents ns0 prefixes return ET.tostring(root, encoding='unicode') except Exception: # fallback: keep the original regex-based approach (conservative) # non-self-closing def repl_pair(m): start, attrs, inner = m.group(1), m.group(2), m.group(3) if re.search(r'data-iso\s*=\s*"[^\"]*"', attrs, flags=re.IGNORECASE): return m.group(0) cand = None for pat in (r'id\s*=\s*"([^\"]*)"', r'name\s*=\s*"([^\"]*)"', r'data-name\s*=\s*"([^\"]*)"'): mm = re.search(pat, attrs, flags=re.IGNORECASE) if mm: cand = mm.group(1) break if not cand: t = re.search(r"(.*?)", inner, flags=re.IGNORECASE | 
re.DOTALL) if t: cand = t.group(1) if not cand: return m.group(0) code = norm_map.get(normalize_name(cand)) if not code: return m.group(0) tag = start[1:] attrs_str = attrs.strip() mid = f" {attrs_str}" if attrs_str else "" return f"{start}{mid} data-iso=\"{code}\">{inner}" svg = re.sub(r"(<(?:path|g)\b)([^>]*?)>(.*?)", repl_pair, svg, flags=re.IGNORECASE | re.DOTALL) # self-closing def repl_self(m): start, attrs, tail = m.group(1), m.group(2), m.group(3) if re.search(r'data-iso\s*=\s*"[^\"]*"', attrs, flags=re.IGNORECASE): return m.group(0) cand = None for pat in (r'id\s*=\s*"([^\"]*)"', r'name\s*=\s*"([^\"]*)"', r'data-name\s*=\s*"([^\"]*)"'): mm = re.search(pat, attrs, flags=re.IGNORECASE) if mm: cand = mm.group(1) break if not cand: return m.group(0) code = norm_map.get(normalize_name(cand)) if not code: return m.group(0) attrs_str = attrs.strip() mid = f" {attrs_str}" if attrs_str else "" return f"{start}{mid} data-iso=\"{code}\"{tail}" svg = re.sub(r"(<(?:path|g)\b)([^>]*?)(/>)", repl_self, svg, flags=re.IGNORECASE | re.DOTALL) return svg # ---------------------- main flow ---------------------- def main(argv=None): """CLI entrypoint: parse arguments, run the pipeline and optionally write the file.""" parser = argparse.ArgumentParser() parser.add_argument("--in-place", action="store_true") parser.add_argument("--file", type=Path, default=FILE_PATH) args = parser.parse_args(argv) svg_path = args.file if not svg_path.exists(): print("SVG not found:", svg_path) return 2 original = read_text(svg_path) # extract text blocks to protect them text_blocks = [] def extract_text(s): """Extract ... 
blocks and replace them with unique markers.""" nonlocal text_blocks pat = re.compile(r"(]*>.*?)", flags=re.IGNORECASE | re.DOTALL) def r(m): idx = len(text_blocks) text_blocks.append(m.group(1)) return f"" return pat.sub(r, s) def restore_text(s): """Restore previously extracted blocks back into the SVG string.""" for i, b in enumerate(text_blocks): s = s.replace(f"", b) return s svg = extract_text(original) steps = [ (extract_inner_svg, "extract_inner_svg"), (add_svg_attributes, "add_svg_attributes"), (collapse_path_tags, "collapse_path_tags"), (remove_defs, "remove_defs"), (lambda s: add_data_iso(s, ISO_JSON), "add_data_iso"), (remove_data_geo, "remove_data_geo"), (remove_original_strokewidth, "remove_original_strokewidth"), (uppercase_data_iso, "uppercase_data_iso"), (clear_fill_stroke, "clear_fill_stroke"), (remove_empty_groups, "remove_empty_groups"), (lambda s: s, "noop_compact"), ] last_good = svg for func, name in steps: print(f"[stage] {name}") try: svg = func(svg) except Exception as e: print(f"ERROR in {name}: {e}") svg = last_good break ok, msg = validate_xml(svg) if not ok: print(f"Invalid XML after {name}: {msg}") svg = last_good break last_good = svg # pretty print print("[stage] pretty_format") try: dom = minidom.parseString(svg) # remove whitespace-only text nodes to avoid accumulating blank lines strip_whitespace_text_nodes(dom) pretty = dom.toprettyxml(indent=" ") # remove xml decl if present if pretty.startswith("

#!/usr/bin/env python3
"""
SVG cleanup utility for worldmap.svg

This script performs a series of conservative, text-based transforms on an SVG
file to normalize path tags, strip unwanted attributes, add data-iso attributes
when possible, and pretty-print the result.

Usage:
  python3 scripts/cleanup_worldmap.py [--in-place] [--file PATH]

By default the script does a dry run and prints a small preview. Use
--in-place to overwrite the file (a timestamped backup will be created).
--file selects the SVG to process and defaults to FILE_PATH.
"""

from pathlib import Path
import re
import sys
import argparse
import datetime
from xml.dom import minidom
from xml.parsers.expat import ExpatError
import xml.etree.ElementTree as ET
from xml.dom import Node
import json
import unicodedata

# Default SVG processed when --file is not given (relative to the current
# working directory).
FILE_PATH = Path("public/data/worldmap.svg")
# JSON mapping file located next to the SVG; main() passes it to add_data_iso().
ISO_JSON = FILE_PATH.parent / "ISO3166-1.json"


def read_text(p: Path) -> str:
    """Return the full contents of the file at ``p``, decoded as UTF-8."""
    return p.read_text(encoding="utf-8")


def write_text(p: Path, s: str) -> None:
    """Write the string ``s`` to the file at ``p``, encoded as UTF-8."""
    p.write_text(s, encoding="utf-8")


def backup(p: Path) -> Path:
    """Create a timestamped copy of ``p`` and return the path of the copy.

    The copy is written next to the original and is named by appending
    ``.bak.<YYYYmmdd_HHMMSS>`` (local time) to the original suffix, e.g.
    ``worldmap.svg`` -> ``worldmap.svg.bak.20240131_235959``.

    The file is copied by reading and re-writing it as UTF-8 text, so it
    must be decodable as UTF-8.
    """
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # with_suffix() replaces the last suffix, so the original suffix is
    # re-added in front of the ".bak.<timestamp>" marker.
    bak = p.with_suffix(p.suffix + f".bak.{ts}")
    bak.write_text(p.read_text(encoding="utf-8"), encoding="utf-8")
    return bak


# NOTE(review): the return annotation "(bool, str)" is a plain tuple
# expression rather than a typing construct; the function returns a
# 2-tuple of (bool, str).
def validate_xml(s: str) -> (bool, str):
    """Check that ``s`` is well-formed XML by parsing it with minidom.

    Returns:
        ``(True, "OK")`` when parsing succeeds, otherwise
        ``(False, error_message)`` where ``error_message`` is ``str()`` of
        the exception that was raised. The function does not raise on a
        parse failure.
    """
    try:
        minidom.parseString(s)
        return True, "OK"
    except ExpatError as e:
        # Malformed XML reported by the expat parser.
        return False, str(e)
    except Exception as e:
        # Any other failure is reported the same way instead of propagating.
        return False, str(e)


def normalize_name(s: str) -> str:
    """Normalize a country name for loose matching.

    Steps, in order: NFKD-decompose the text and drop combining marks
    (removes diacritics), lowercase, replace ``&`` with `` and ``, replace
    every run of characters outside ``[a-z0-9]`` with a single space, then
    collapse whitespace and strip the ends.

    A falsy input (``""`` or ``None``) yields ``""``.

    Example: ``"Côte d'Ivoire"`` -> ``"cote d ivoire"``.
    """
    if not s:
        return ""
    # NFKD splits accented letters into base letter + combining mark so the
    # marks can be filtered out on the next line.
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower()
    # Expand "&" before punctuation is stripped so "a & b" matches "a and b".
    s = s.replace("&", " and ")
    # Anything that is not an ASCII letter or digit becomes a separator; this
    # includes letters that have no NFKD decomposition (e.g. "ø").
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()


# NOTE(review): the definition below continues past the end of the visible
# chunk; its header is reproduced verbatim and left untouched.
def extract_inner_svg(svg: str) -> str:
    """If the file contains a nested