"""
Scrape mineral data from Wikipedia for the Daily Stone feature.

Usage:
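    python manage.py scrape_minerals                  # scrape every mineral in MINERAL_LIST
    python manage.py scrape_minerals --limit 10       # scrape first 10 only
    python manage.py scrape_minerals --dry-run        # just list names, don't save
    python manage.py scrape_minerals --skip-existing  # skip minerals already in the database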
"""
import re
import time
import hashlib
import json

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from dailystone.models import Mineral

# Curated list of well-known, visually interesting minerals with approximate colors.
# Each entry is (Wikipedia page title, color hex). The title may carry a
# disambiguation suffix such as "(mineral)"; the hex is a rough representative
# of the mineral's typical appearance.
# Order matters: the 1-based position of an entry is stored as the mineral's
# day_of_year by the scrape command, so append new entries rather than
# reordering existing ones.
# Every name appears exactly once (compared case-insensitively).
MINERAL_LIST = [
    ("Quartz", "#f5f5f5"),
    ("Amethyst", "#9b59b6"),
    ("Rose quartz", "#f4a7b9"),
    ("Citrine (quartz)", "#f0c420"),
    ("Diamond", "#e8e8e8"),
    ("Ruby", "#e0115f"),
    ("Sapphire", "#0f52ba"),
    ("Emerald", "#50c878"),
    ("Topaz", "#ffc87c"),
    ("Opal", "#a8c3bc"),
    ("Turquoise (mineral)", "#40e0d0"),
    ("Garnet", "#733635"),
    ("Peridot", "#b4c424"),
    ("Aquamarine", "#7fffd4"),
    ("Tanzanite", "#4d5ba8"),
    ("Malachite", "#0bda51"),
    ("Lapis lazuli", "#26619c"),
    ("Jade", "#00a86b"),
    ("Obsidian", "#3d3635"),
    ("Pyrite", "#c5a647"),
    ("Hematite", "#5c5858"),
    ("Magnetite", "#353535"),
    ("Calcite", "#f5deb3"),
    ("Fluorite", "#7b68ee"),
    ("Apatite", "#509987"),
    ("Beryl", "#c1f0c1"),
    ("Spinel", "#ff4040"),
    ("Zircon", "#c4b19e"),
    ("Alexandrite", "#568c4c"),
    ("Tourmaline", "#86c67c"),
    ("Moonstone", "#c5cfe0"),
    ("Sunstone", "#e07020"),
    ("Labradorite", "#5678a0"),
    ("Rhodonite", "#e87ea1"),
    ("Rhodochrosite", "#e55b6e"),
    ("Azurite", "#2d5da1"),
    ("Chrysocolla", "#4cb9a0"),
    ("Cuprite", "#a52a2a"),
    ("Dioptase", "#209d7d"),
    ("Wulfenite", "#e68a00"),
    ("Vanadinite", "#cc3333"),
    ("Crocoite", "#e74c3c"),
    ("Realgar", "#e34234"),
    ("Orpiment", "#e9a820"),
    ("Stibnite", "#708090"),
    ("Galena", "#6b6e70"),
    ("Cinnabar", "#e44d2e"),
    ("Barite", "#c8c8c0"),
    ("Celestine (mineral)", "#a8d8ea"),
    ("Gypsum", "#f0ece2"),
    ("Halite", "#f0f0f0"),
    ("Sylvite", "#e0c0a0"),
    ("Sulfur", "#edda09"),
    ("Copper", "#b87333"),
    ("Gold", "#ffd700"),
    ("Silver", "#c0c0c0"),
    ("Platinum", "#e5e4e2"),
    ("Bismuth", "#969696"),
    ("Antimony", "#7b8c8a"),
    ("Arsenic", "#808080"),
    ("Graphite", "#474747"),
    ("Corundum", "#d9413c"),
    ("Spodumene", "#d8bfd8"),
    ("Kunzite", "#e6a8d7"),
    ("Hiddenite", "#98fb98"),
    ("Chrysoberyl", "#e8d44d"),
    ("Nephrite", "#638b57"),
    ("Jadeite", "#00a86b"),
    ("Serpentine subgroup", "#6b8e23"),
    ("Chalcopyrite", "#b8860b"),
    ("Bornite", "#8b6914"),
    ("Covellite", "#4169e1"),
    ("Molybdenite", "#6e6e6e"),
    ("Sphalerite", "#a0522d"),
    ("Wurtzite", "#8b4513"),
    ("Cassiterite", "#5c4033"),
    ("Rutile", "#b22222"),
    ("Anatase", "#4682b4"),
    ("Brookite", "#8b5e3b"),
    ("Ilmenite", "#404040"),
    ("Goethite", "#7b6b3a"),
    ("Limonite", "#9a7b4f"),
    ("Siderite", "#8b7d6b"),
    ("Magnesite", "#ede6d6"),
    ("Dolomite", "#dfc8a8"),
    ("Aragonite", "#faebd7"),
    ("Smithsonite", "#7ec8c8"),
    ("Cerussite", "#c8c8c0"),
    ("Witherite", "#e8e0d8"),
    ("Strontianite", "#c8d8c0"),
    ("Ankerite", "#c8b890"),
    ("Olivine", "#9ab973"),
    ("Forsterite", "#96be50"),
    ("Fayalite", "#6b5c3e"),
    ("Augite", "#2e4032"),
    ("Diopside", "#507856"),
    ("Enstatite", "#908870"),
    ("Hypersthene", "#5b5e4e"),
    ("Wollastonite", "#e8e0d8"),
    ("Tremolite", "#e0e8d8"),
    ("Actinolite", "#2d8b57"),
    ("Hornblende", "#3b4838"),
    ("Glaucophane", "#5b6db8"),
    ("Riebeckite", "#2f4f4f"),
    ("Muscovite", "#d4c48d"),
    ("Biotite", "#4a3c28"),
    ("Phlogopite", "#c4a35a"),
    ("Lepidolite", "#c8a2c8"),
    ("Talc", "#e8e8e0"),
    ("Kaolinite", "#f0e8d8"),
    ("Montmorillonite", "#c8b090"),
    ("Vermiculite", "#b89c78"),
    ("Chlorite group", "#6b8f47"),
    ("Prehnite", "#c8e8a0"),
    ("Epidote", "#7b8b2e"),
    ("Zoisite", "#6b8b73"),
    ("Clinozoisite", "#7b9b6b"),
    ("Vesuvianite", "#6b8040"),
    ("Pumpellyite", "#447744"),
    ("Lawsonite", "#8090a0"),
    ("Andalusite", "#b08080"),
    ("Sillimanite", "#c8c0b8"),
    ("Kyanite", "#5b8fbe"),
    ("Staurolite", "#7b5b3b"),
    ("Cordierite", "#6666aa"),
    ("Sodalite", "#3c578e"),
    ("Lazurite", "#26619c"),
    ("Hauyne", "#4466bb"),
    ("Leucite", "#d8d0c8"),
    ("Nepheline", "#c8c0a8"),
    ("Scapolite", "#d0c8b0"),
    ("Danburite", "#e8e0d0"),
    ("Datolite", "#d4e8d0"),
    ("Titanite", "#b8a048"),
    ("Dumortierite", "#4060a0"),
    ("Hemimorphite", "#98d8e8"),
    ("Willemite", "#70b020"),
    ("Phenakite", "#e0e0d8"),
    ("Euclase", "#80b8d8"),
    ("Bertrandite", "#d8d0c0"),
    ("Chrysoprase", "#79a868"),
    ("Carnelian", "#b5462a"),
    ("Jasper", "#ce4a2f"),
    ("Agate", "#b0a090"),
    ("Onyx", "#353839"),
    ("Chalcedony", "#c8d0d8"),
    ("Tiger's eye", "#b8860b"),
    ("Hawk's eye", "#4c6c8c"),
    ("Bloodstone", "#3b6e3f"),
    ("Aventurine", "#568b52"),
    ("Amazonite", "#4c8c7a"),
    ("Larvikite", "#4a5060"),
    ("Charoite", "#7b4e8a"),
    ("Sugilite", "#8b4789"),
    ("Larimar", "#88c8de"),
    ("Pietersite", "#4a5c40"),
    ("Moldavite", "#6b8e23"),
    ("Tektite", "#3a3a3a"),
    ("Shungite", "#2c2c2c"),
    ("Seraphinite", "#4a7c5a"),
    ("Astrophyllite", "#8b6c2a"),
    ("Nuummite", "#3a3a3a"),
    ("Howlite", "#e8e0d8"),
    ("Unakite", "#7a8a5a"),
    ("Variscite", "#50b848"),
    ("Wavellite", "#78b868"),
    ("Vivianite", "#2e5e8e"),
    ("Erythrite", "#d84888"),
    ("Annabergite", "#58b858"),
    ("Adamite", "#b8e830"),
    ("Legrandite", "#e8d830"),
    ("Aurichalcite", "#78c8b8"),
    ("Rosasite", "#58a8a8"),
    ("Cavansite", "#3070c8"),
    ("Pentlandite", "#b8a830"),
    ("Millerite", "#b8a040"),
    ("Nickeline", "#c8a088"),
    ("Skutterudite", "#808080"),
    ("Cobaltite", "#808888"),
    ("Arsenopyrite", "#808888"),
    ("Marcasite", "#c0b838"),
    ("Pyrrhotite", "#a09048"),
    ("Chromite", "#404040"),
    ("Spessartine", "#e86838"),
    ("Almandine", "#a03050"),
    ("Pyrope", "#c82040"),
    ("Grossular", "#80b840"),
    ("Andradite", "#686830"),
    ("Uvarovite", "#388838"),
    ("Tsavorite", "#38a848"),
    ("Demantoid", "#58a838"),
    ("Melanite", "#303030"),
    ("Topazolite", "#d8c838"),
    ("Schorl", "#2c2c2c"),
    ("Elbaite", "#48b888"),
    ("Dravite", "#8b6c3a"),
    ("Indicolite", "#287888"),
    ("Rubellite", "#c83868"),
    ("Paraiba tourmaline", "#00b8c8"),
    ("Watermelon tourmaline", "#78b858"),
    ("Tephroite", "#7a6a50"),
    ("Rhodolite", "#c84878"),
    ("Iolite", "#5858a8"),
    ("Scolecite", "#e8e0e0"),
    ("Natrolite", "#e0e0d0"),
    ("Stilbite", "#e8b898"),
    ("Heulandite", "#e0c090"),
    ("Apophyllite", "#c8e8d0"),
    ("Analcime", "#e0e0d0"),
    ("Chabazite", "#e0c898"),
    ("Phillipsite", "#d0c8b0"),
    ("Thomsonite", "#d8d0c0"),
    ("Mesolite", "#e8e0d8"),
    ("Laumontite", "#e0d0b0"),
    ("Mordenite", "#e0d8c8"),
    ("Clinoptilolite", "#d8d0c0"),
    ("Erionite", "#e0d8d0"),
    ("Colemanite", "#e0d8c8"),
    ("Ulexite", "#f0e8e0"),
    ("Borax", "#e8e0d8"),
    ("Kernite", "#e0d8d0"),
    ("Tincalconite", "#e8e0d8"),
    ("Sassolite", "#e0e0d0"),
    ("Boracite", "#c8d8c0"),
    ("Sinhalite", "#c0a870"),
    ("Kornerupine", "#587848"),
    ("Grandidierite", "#4898a8"),
    ("Serendibite", "#384838"),
    ("Taaffeite", "#c888c8"),
    ("Painite", "#a06040"),
    ("Musgravite", "#808878"),
    ("Jeremejevite", "#a8c8e0"),
    ("Poudretteite", "#e8c0d8"),
    ("Benitoite", "#3858c8"),
    ("Neptunite", "#383028"),
    ("Joaquinite", "#a88030"),
    ("Sanbornite", "#e0d8c8"),
    ("Fresnoite", "#e0d030"),
    ("Celsian", "#d8d0c0"),
    ("Hyalophane", "#d0c8b0"),
    ("Harmotome", "#d8d0c0"),
    ("Pectolite", "#d0e0e0"),
    ("Okenite", "#f0e8e0"),
    ("Gyrolite", "#e0e8d0"),
    ("Tobermorite", "#d8d0c8"),
    ("Xonotlite", "#e0d8d0"),
    ("Thaumasite", "#e8e0d8"),
    ("Ettringite", "#e8e838"),
    ("Sturmanite", "#e8e030"),
    ("Charlesite", "#e0e0c8"),
    ("Afwillite", "#e0d8d0"),
    ("Hillebrandite", "#e0d8c8"),
    ("Foshagite", "#e0e0d0"),
    ("Jennite", "#d8c8b8"),
    ("Suolunite", "#d8d0c0"),
    ("Rosenbuschite", "#c8a070"),
    ("Eudialyte", "#c84860"),
    ("Catapleiite", "#c8c0b0"),
    ("Lorenzenite", "#584838"),
    ("Ramsayite", "#685838"),
    ("Lamprophyllite", "#a08030"),
    ("Murmanite", "#907050"),
    ("Lomonosovite", "#886040"),
    ("Vuonnemite", "#b89050"),
    ("Villiaumite", "#e8a030"),
    ("Ussingite", "#c8a0b8"),
    ("Chkalovite", "#d8d0c8"),
    ("Tugtupite", "#e0586e"),
    ("Sorensenite", "#d8d0c8"),
    ("Tinguaite", "#586850"),
    ("Cancrinite", "#e0c030"),
    ("Vishnevite", "#c8c0a0"),
    ("Davyne", "#d0c890"),
    ("Microsommite", "#d0c890"),
    ("Nosean", "#707888"),
    ("Hackmanite", "#9870a0"),
    ("Pargasite", "#386838"),
    ("Edenite", "#507848"),
    ("Kaersutite", "#483830"),
    ("Richterite", "#586880"),
    ("Winchite", "#607060"),
    ("Barroisite", "#506860"),
    ("Gedrite", "#606058"),
    ("Anthophyllite", "#807860"),
    ("Cummingtonite", "#787068"),
    ("Grunerite", "#686058"),
    ("Holmquistite", "#5868a0"),
    ("Sapphirine", "#4060a8"),
    ("Prismatine", "#586048"),
    ("Boralsilite", "#d0c8b8"),
    ("Werdingite", "#a8a098"),
    ("Ominelite", "#404038"),
    ("Johachidolite", "#e0c070"),
    ("Selenite (mineral)", "#f0eee0"),
    ("Desert rose (crystal)", "#d8b890"),
    ("Fulgurite", "#c8b898"),
    ("Meteorite", "#686058"),
    ("Pallasite", "#a09048"),
    ("Kamacite", "#909090"),
    ("Taenite", "#a0a0a0"),
    ("Troilite", "#886838"),
    ("Schreibersite", "#a0a098"),
    ("Cohenite", "#686060"),
    ("Moissanite", "#b8e8c8"),
    ("Lonsdaleite", "#c8c8c0"),
    ("Stishovite", "#d0d0c8"),
    ("Coesite", "#c8c8c0"),
    ("Seifertite", "#c0c0b8"),
    ("Ringwoodite", "#5878c8"),
    ("Bridgmanite", "#a0a098"),
    ("Davemaoite", "#a8a0a0"),
    ("Ice", "#e0f0f8"),
    ("Dry ice", "#e8e8f0"),
    ("Sal ammoniac", "#e0e0d8"),
    ("Niter", "#e8e0d8"),
    ("Natron", "#e0d8c8"),
    ("Trona", "#d8d0c0"),
    ("Thermonatrite", "#e0d8d0"),
    ("Gaylussite", "#d8d0c0"),
    ("Pirssonite", "#d0c8c0"),
    ("Shortite", "#e0d838"),
    ("Northupite", "#d8d0c0"),
    ("Eitelite", "#d0c8c0"),
    ("Bradleyite", "#c8c0b0"),
    ("Tychite", "#c8c0b0"),
    ("Schairerite", "#d0c8b8"),
    ("Sulfohalite", "#d0c8b8"),
    ("Kogarkoite", "#d0c8c0"),
]

# Collapse repeated names, compared case-insensitively. dict.setdefault only
# stores a value the first time a key is seen, so the earliest (name, color)
# pair wins and the original ordering of first occurrences is preserved.
_first_by_key = {}
for name, color in MINERAL_LIST:
    _first_by_key.setdefault(name.lower(), (name, color))
MINERAL_LIST = list(_first_by_key.values())

# Lazily created requests.Session shared by every helper in this module.
SESSION = None


def get_session():
    """Return the shared HTTP session, building it on the first call.

    The session is stored in the module-level ``SESSION`` global and is
    configured with a User-Agent header identifying this scraper.
    """
    global SESSION
    if SESSION is not None:
        return SESSION

    SESSION = requests.Session()
    SESSION.headers.update({
        'User-Agent': 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
    })
    return SESSION


def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
    """GET ``url`` and retry on HTTP 429, honouring ``Retry-After`` when given.

    Args:
        session: Object exposing ``get(url, params=..., timeout=...)``.
        url: Address to request.
        params: Query parameters forwarded to ``session.get``.
        timeout: Per-request timeout in seconds.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so that a response always exists.

    Returns:
        The first response whose status is not 429.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error status,
        including a 429 that is still returned on the final attempt.
    """
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp

        # No further attempt follows the last one, so waiting would only
        # delay the error reported below.
        if attempt == attempts - 1:
            break

        retry_after = resp.headers.get('Retry-After')
        if retry_after and retry_after.isdigit():
            wait = int(retry_after) + 1
        else:
            # Non-numeric (e.g. HTTP-date) or missing header: 10, 20, 40, 80...
            wait = 10 * (2 ** attempt)
        time.sleep(min(wait, 120))  # Cap every wait at 2 minutes

    # Still rate-limited after the final attempt: surface the HTTP error.
    resp.raise_for_status()
    return resp


def get_wikipedia_page(title):
    """Ask the English Wikipedia API to parse the page called ``title``.

    Redirects are followed. Returns the ``parse`` object of the JSON reply
    (requested with the rendered text and the list of images), or ``None``
    when the reply contains an ``error`` entry.
    """
    query = {
        'action': 'parse',
        'page': title,
        'prop': 'text|images',
        'format': 'json',
        'redirects': 1,
    }
    http = get_session()
    reply = _request_with_backoff(http, 'https://en.wikipedia.org/w/api.php', params=query)
    payload = reply.json()
    return None if 'error' in payload else payload['parse']


# Lower-cased substrings that mark a file as an icon, logo, maintenance-template
# artwork or SVG rather than a photograph.
_IMAGE_SKIP_MARKERS = (
    'icon', 'logo', 'symbol', 'flag', 'commons-logo', 'wiki',
    'question_mark', 'edit-clear', 'ambox', 'crystal_clear',
    'lock-', 'padlock', 'red_pencil', 'text-', 'globe_',
    'folder_', 'nuvola', 'gnome-', 'information', '.svg',
    'wiktionary', 'disambig', 'merge-', 'split-', 'portal-',
)

# Audio/video files (e.g. pronunciation clips) can appear in a page's file
# list. They are not pictures, and the imageinfo fallback to the raw 'url'
# could otherwise hand the media file back as an "image".
_NON_IMAGE_EXTENSIONS = (
    '.ogg', '.oga', '.ogv', '.opus', '.wav', '.mp3', '.mid', '.flac', '.webm',
)


def _is_candidate_image(img_name):
    """Return True when ``img_name`` looks like a content picture."""
    lowered = img_name.lower()
    if lowered.endswith(_NON_IMAGE_EXTENSIONS):
        return False
    return not any(marker in lowered for marker in _IMAGE_SKIP_MARKERS)


def get_image_urls(parse_data, limit=4):
    """Resolve up to ``limit`` image URLs for the files listed on a parsed page.

    Args:
        parse_data: Mapping returned by the parse API; its optional
            ``'images'`` entry is a list of file names.
        limit: Maximum number of URLs to return. At most ``limit * 2`` files
            are looked up so a few failed lookups do not starve the result.

    Returns:
        A list of URLs. The 800px-wide thumbnail is preferred, with the
        original file URL as a fallback. The list may be shorter than
        ``limit`` or empty.
    """
    images = parse_data.get('images', [])
    good = [img for img in images if _is_candidate_image(img)]
    if not good:
        # Everything was filtered out: fall back to any raster image at all.
        good = [img for img in images if img.lower().endswith(('.jpg', '.jpeg', '.png'))]

    candidates = good[:limit * 2]
    if not candidates:
        return []

    session = get_session()
    urls = []
    for img_name in candidates:
        try:
            resp = _request_with_backoff(session, 'https://en.wikipedia.org/w/api.php', params={
                'action': 'query',
                'titles': f'File:{img_name}',
                'prop': 'imageinfo',
                'iiprop': 'url|size',
                'iiurlwidth': 800,
                'format': 'json',
            }, timeout=15)
            pages = resp.json()['query']['pages']
            for page in pages.values():
                if 'imageinfo' in page:
                    info = page['imageinfo'][0]
                    thumb = info.get('thumburl', info.get('url', ''))
                    if thumb:
                        urls.append(thumb)
                        if len(urls) >= limit:
                            return urls
        except Exception:
            # Deliberately best-effort: one image that cannot be resolved
            # (network error, unexpected payload) must not abort the mineral.
            continue
    return urls


def extract_infobox(soup):
    """Read the first ``infobox`` table into a ``{label: value}`` dict.

    Labels are the lower-cased header-cell text; values are the data-cell
    text with fragments joined by single spaces. Rows lacking either a header
    or a data cell are ignored, and a repeated label keeps its last value.
    An empty dict is returned when the page has no infobox.
    """
    table = soup.find('table', class_='infobox')
    if not table:
        return {}

    fields = {}
    for row in table.find_all('tr'):
        header = row.find('th')
        cell = row.find('td')
        if not (header and cell):
            continue
        label = header.get_text(strip=True).lower()
        fields[label] = cell.get_text(' ', strip=True)
    return fields


# Infobox label aliases, keyed by the Mineral field they populate.
# match_field() treats each alias as a substring of the lower-cased infobox
# label and tries the aliases in the order listed, so keep the most specific
# spellings first. Labels come from get_text(strip=True), which can fuse the
# words of a header cell together (hence entries such as 'mohs scalehardness').
FIELD_MAPPINGS = {
    'formula': [
        'formula',
        'chemical formula',
        'idealformula',
        'formula(repeating unit)',
        'chemical',
        'composition',
    ],
    'category': [
        'category',
        'mineral class',
        'classification',
        'group',
    ],
    'crystal_system': [
        'crystal system',
        'crystalsystem',
        'crystal class',
        'system',
    ],
    'mohs_hardness': [
        'mohs scalehardness',
        'mohs scale hardness',
        'hardness',
        'mohs hardness',
    ],
    'luster': [
        'luster',
        'lustre',
        'luster (mineralogy)',
    ],
    'streak': [
        'streak',
        'streak color',
    ],
    'specific_gravity': [
        'specific gravity',
        'density',
        'specificgravity',
        'relative density',
    ],
    'color_description': [
        'color',
        'colour',
        'color/pleochroism',
    ],
}


def match_field(info, candidates):
    """Return the value of the first ``info`` entry whose key contains a candidate.

    Candidates are tried in the order given; for each one the keys of
    ``info`` are scanned in dict order and a substring match wins. An empty
    string is returned when nothing matches.
    """
    hits = (
        value
        for candidate in candidates
        for label, value in info.items()
        if candidate in label
    )
    return next(hits, '')


def _clean_text(text):
    """Strip Wikipedia citation markers from ``text`` and tidy its whitespace.

    Removes numeric reference markers (``[1]``, ``[ 2 , 3 ]``) and the
    "citation needed" / "clarification needed" tags, collapses runs of
    whitespace, and drops stray spaces before closing punctuation and after
    an opening parenthesis. Returns the stripped result.
    """
    text = re.sub(r'\[[\d,\s]+\]', '', text)
    # Callers extract text with get_text(' ', ...), which puts spaces between
    # the bracket and the words ("[ citation needed ]"), so optional
    # whitespace inside the brackets has to be accepted.
    text = re.sub(
        r'\[\s*(?:citation|clarification)\s+needed\s*\]', '', text,
        flags=re.IGNORECASE,
    )
    # Normalize whitespace (collapse multiple spaces, fix space before punctuation)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
    text = re.sub(r'(\()\s+', r'\1', text)
    return text.strip()


def _find_heading_wrapper(tag):
    """Return the element that stands for ``tag``'s heading at sibling level.

    If the heading's parent is a ``<div>`` carrying the ``mw-heading`` class,
    that div is returned; otherwise ``tag`` itself is returned.
    """
    container = tag.parent
    if not container or container.name != 'div':
        return tag
    css_classes = container.get('class') or []
    return container if 'mw-heading' in css_classes else tag


def extract_description(soup):
    """Build a short description from the first substantial paragraphs.

    Walks every ``<p>`` in document order, skips those whose text is 50
    characters or shorter, cleans the rest and stops once three have been
    kept. The kept paragraphs are joined with blank lines; the result is an
    empty string when none qualify.
    """
    kept = []
    for paragraph in soup.find_all('p'):
        raw = paragraph.get_text(' ', strip=True)
        if len(raw) <= 50:
            continue
        kept.append(_clean_text(raw))
        if len(kept) >= 3:
            break
    return '\n\n'.join(kept)


def _collect_section_paragraphs(start_element, max_paras=2):
    """Gather cleaned paragraph texts that follow ``start_element``.

    Siblings are visited in order until a heading is met (a bare ``h2``/``h3``
    or a ``div`` with one of the ``mw-heading*`` classes) or until
    ``max_paras`` paragraphs have been kept. Paragraphs whose text is 30
    characters or shorter are skipped.
    """
    heading_markers = {'mw-heading', 'mw-heading2', 'mw-heading3'}
    collected = []
    node = start_element.find_next_sibling()
    while node:
        tag_name = node.name
        # A bare h2/h3 closes the section.
        if tag_name in ('h2', 'h3'):
            break
        # So does the div that wraps a heading.
        if tag_name == 'div' and heading_markers.intersection(node.get('class') or []):
            break
        if tag_name == 'p':
            raw = node.get_text(' ', strip=True)
            if len(raw) > 30:
                collected.append(_clean_text(raw))
                if len(collected) >= max_paras:
                    break
        node = node.find_next_sibling()
    return collected


def extract_history(soup):
    """Return text from the first history-like section that has paragraphs.

    Every ``h2``/``h3`` heading is examined in document order. A heading
    qualifies when its lower-cased title (minus a trailing ``[edit]``)
    contains one of: history, etymology, discovery, naming, occurrence. The
    paragraphs collected after the first qualifying heading that yields any
    are joined with blank lines; otherwise an empty string is returned.
    """
    keywords = ['history', 'etymology', 'discovery', 'naming', 'occurrence']

    for heading in soup.find_all(['h2', 'h3']):
        title = heading.get_text(strip=True).lower()
        title = re.sub(r'\[edit\]$', '', title).strip()
        if not any(word in title for word in keywords):
            continue
        # Headings may sit inside a wrapper div; paragraphs are siblings of it.
        anchor = _find_heading_wrapper(heading)
        paragraphs = _collect_section_paragraphs(anchor)
        if paragraphs:
            return '\n\n'.join(paragraphs)
    return ''


class Command(BaseCommand):
    """Scrape every MINERAL_LIST entry from Wikipedia and upsert a Mineral row for it."""

    help = 'Scrape mineral data from Wikipedia'

    # Be polite to Wikipedia — ~3s between minerals keeps us under rate limits.
    REQUEST_DELAY_SECONDS = 3

    def add_arguments(self, parser):
        """Register the --limit, --dry-run and --skip-existing options."""
        parser.add_argument('--limit', type=int, default=0, help='Max minerals to scrape (0 = all)')
        parser.add_argument('--dry-run', action='store_true', help='List minerals without saving')
        parser.add_argument('--skip-existing', action='store_true', help='Skip already-saved minerals')

    @staticmethod
    def _display_name(title):
        """Strip a trailing disambiguation suffix, e.g. 'Citrine (quartz)' -> 'Citrine'."""
        return re.sub(r'\s*\([^)]*\)\s*$', '', title).strip()

    def _scrape_mineral(self, name, display_name, color_hex, day_of_year):
        """Fetch one mineral page and store it; return True when a row was saved.

        Writes the outcome (NOT FOUND / DISAMBIGUATION / CREATED / UPDATED) to
        stdout. Exceptions from the network, parsing or the database propagate
        to the caller.
        """
        parsed = get_wikipedia_page(name)
        if not parsed:
            self.stdout.write(self.style.WARNING(' NOT FOUND'))
            return False

        html = parsed['text']['*']
        soup = BeautifulSoup(html, 'html.parser')

        # Skip actual disambiguation pages (they have the dmbox class)
        if soup.find('table', id='disambigbox') or soup.find('div', class_='dmbox'):
            self.stdout.write(self.style.WARNING(' DISAMBIGUATION - SKIPPED'))
            return False

        info = extract_infobox(soup)
        image_urls = get_image_urls(parsed, limit=4)
        description = extract_description(soup)
        history = extract_history(soup)

        # Slices keep each value within a fixed maximum length
        # (presumably the model's column sizes — confirm against Mineral).
        mineral, created = Mineral.objects.update_or_create(
            name=display_name,
            defaults={
                'formula': match_field(info, FIELD_MAPPINGS['formula'])[:200],
                'category': match_field(info, FIELD_MAPPINGS['category'])[:200],
                'crystal_system': match_field(info, FIELD_MAPPINGS['crystal_system'])[:200],
                'mohs_hardness': match_field(info, FIELD_MAPPINGS['mohs_hardness'])[:50],
                'luster': match_field(info, FIELD_MAPPINGS['luster'])[:200],
                'streak': match_field(info, FIELD_MAPPINGS['streak'])[:200],
                'specific_gravity': match_field(info, FIELD_MAPPINGS['specific_gravity'])[:100],
                'color_description': match_field(info, FIELD_MAPPINGS['color_description'])[:300],
                'color_hex': color_hex,
                'description': description,
                'history': history,
                'image_urls': image_urls,
                'wikipedia_url': f'https://en.wikipedia.org/wiki/{name.replace(" ", "_")}',
                'day_of_year': day_of_year,
            },
        )

        status = 'CREATED' if created else 'UPDATED'
        img_count = len(image_urls)
        self.stdout.write(self.style.SUCCESS(f' {status} ({img_count} images)'))
        return True

    def handle(self, *args, **options):
        """Walk the mineral list, honouring --limit, --dry-run and --skip-existing."""
        limit = options['limit']
        dry_run = options['dry_run']
        skip_existing = options['skip_existing']

        minerals = MINERAL_LIST
        if limit:
            minerals = minerals[:limit]
        total = len(minerals)

        self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        skipped = 0
        failed = 0

        for i, (name, color_hex) in enumerate(minerals, 1):
            # Rows are keyed by the display name, so compute it once and use it
            # for both the existence check and the save.
            display_name = self._display_name(name)

            if skip_existing and Mineral.objects.filter(name=display_name).exists():
                self.stdout.write(f'  [{i}/{total}] SKIP {name} (already exists)')
                skipped += 1
                continue

            if dry_run:
                self.stdout.write(f'  [{i}/{total}] {name} ({color_hex})')
                continue

            self.stdout.write(f'  [{i}/{total}] Scraping {name}...', ending='')

            try:
                saved = self._scrape_mineral(name, display_name, color_hex, i)
            except Exception as e:
                # Top-level boundary: report and carry on with the next mineral.
                self.stdout.write(self.style.ERROR(f' ERROR: {e}'))
                saved = False

            if saved:
                success += 1
            else:
                failed += 1

            # Pause after every attempt that reached Wikipedia, including the
            # NOT FOUND / disambiguation outcomes. Nothing follows the last
            # entry, so no pause is needed there.
            if i < total:
                time.sleep(self.REQUEST_DELAY_SECONDS)

        self.stdout.write(f'\nDone: {success} saved, {skipped} skipped, {failed} failed')