"""
Scrape mineral data from Wikipedia for the Daily Stone feature.

Usage:
    python manage.py scrape_minerals                  # scrape all ~365 minerals
    python manage.py scrape_minerals --limit 10       # scrape first 10 only
    python manage.py scrape_minerals --dry-run        # just list names, don't save
    python manage.py scrape_minerals --skip-existing  # skip minerals already saved
"""
import re
import time
import hashlib
import json

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from dailystone.models import Mineral

# Curated list of well-known, visually interesting minerals with approximate colors.
# Color hex is a rough representative of the mineral's typical appearance.
# Each entry is (Wikipedia page title, color hex). The list contains repeated
# names; they are removed by the dedup pass right below the list.
# NOTE(review): "Taafeite" looks like a misspelling of "Taaffeite" (already
# listed earlier) and survives the case-insensitive dedup -- confirm and drop.
MINERAL_LIST = [
    ("Quartz", "#f5f5f5"), ("Amethyst", "#9b59b6"), ("Rose quartz", "#f4a7b9"),
    ("Citrine (quartz)", "#f0c420"), ("Diamond", "#e8e8e8"), ("Ruby", "#e0115f"),
    ("Sapphire", "#0f52ba"), ("Emerald", "#50c878"), ("Topaz", "#ffc87c"),
    ("Opal", "#a8c3bc"), ("Turquoise (mineral)", "#40e0d0"), ("Garnet", "#733635"),
    ("Peridot", "#b4c424"), ("Aquamarine", "#7fffd4"), ("Tanzanite", "#4d5ba8"),
    ("Malachite", "#0bda51"), ("Lapis lazuli", "#26619c"), ("Jade", "#00a86b"),
    ("Obsidian", "#3d3635"), ("Pyrite", "#c5a647"), ("Hematite", "#5c5858"),
    ("Magnetite", "#353535"), ("Calcite", "#f5deb3"), ("Fluorite", "#7b68ee"),
    ("Apatite", "#509987"), ("Beryl", "#c1f0c1"), ("Spinel", "#ff4040"),
    ("Zircon", "#c4b19e"), ("Alexandrite", "#568c4c"), ("Tourmaline", "#86c67c"),
    ("Moonstone", "#c5cfe0"), ("Sunstone", "#e07020"), ("Labradorite", "#5678a0"),
    ("Rhodonite", "#e87ea1"), ("Rhodochrosite", "#e55b6e"), ("Azurite", "#2d5da1"),
    ("Chrysocolla", "#4cb9a0"), ("Cuprite", "#a52a2a"), ("Dioptase", "#209d7d"),
    ("Wulfenite", "#e68a00"), ("Vanadinite", "#cc3333"), ("Crocoite", "#e74c3c"),
    ("Realgar", "#e34234"), ("Orpiment", "#e9a820"), ("Stibnite", "#708090"),
    ("Galena", "#6b6e70"), ("Cinnabar", "#e44d2e"), ("Barite", "#c8c8c0"),
    ("Celestine (mineral)", "#a8d8ea"), ("Gypsum", "#f0ece2"), ("Halite", "#f0f0f0"),
    ("Sylvite", "#e0c0a0"), ("Sulfur", "#edda09"), ("Copper", "#b87333"),
    ("Gold", "#ffd700"), ("Silver", "#c0c0c0"), ("Platinum", "#e5e4e2"),
    ("Bismuth", "#969696"), ("Antimony", "#7b8c8a"), ("Arsenic", "#808080"),
    ("Graphite", "#474747"), ("Corundum", "#d9413c"), ("Spodumene", "#d8bfd8"),
    ("Kunzite", "#e6a8d7"), ("Hiddenite", "#98fb98"), ("Chrysoberyl", "#e8d44d"),
    ("Nephrite", "#638b57"), ("Jadeite", "#00a86b"), ("Serpentine subgroup", "#6b8e23"),
    ("Chalcopyrite", "#b8860b"), ("Bornite", "#8b6914"), ("Covellite", "#4169e1"),
    ("Molybdenite", "#6e6e6e"), ("Sphalerite", "#a0522d"), ("Wurtzite", "#8b4513"),
    ("Cassiterite", "#5c4033"), ("Rutile", "#b22222"), ("Anatase", "#4682b4"),
    ("Brookite", "#8b5e3b"), ("Ilmenite", "#404040"), ("Goethite", "#7b6b3a"),
    ("Limonite", "#9a7b4f"), ("Siderite", "#8b7d6b"), ("Magnesite", "#ede6d6"),
    ("Dolomite", "#dfc8a8"), ("Aragonite", "#faebd7"), ("Smithsonite", "#7ec8c8"),
    ("Cerussite", "#c8c8c0"), ("Witherite", "#e8e0d8"), ("Strontianite", "#c8d8c0"),
    ("Ankerite", "#c8b890"), ("Olivine", "#9ab973"), ("Forsterite", "#96be50"),
    ("Fayalite", "#6b5c3e"), ("Augite", "#2e4032"), ("Diopside", "#507856"),
    ("Enstatite", "#908870"), ("Hypersthene", "#5b5e4e"), ("Wollastonite", "#e8e0d8"),
    ("Tremolite", "#e0e8d8"), ("Actinolite", "#2d8b57"), ("Hornblende", "#3b4838"),
    ("Glaucophane", "#5b6db8"), ("Riebeckite", "#2f4f4f"), ("Muscovite", "#d4c48d"),
    ("Biotite", "#4a3c28"), ("Phlogopite", "#c4a35a"), ("Lepidolite", "#c8a2c8"),
    ("Talc", "#e8e8e0"), ("Kaolinite", "#f0e8d8"), ("Montmorillonite", "#c8b090"),
    ("Vermiculite", "#b89c78"), ("Chlorite group", "#6b8f47"), ("Prehnite", "#c8e8a0"),
    ("Epidote", "#7b8b2e"), ("Zoisite", "#6b8b73"), ("Clinozoisite", "#7b9b6b"),
    ("Vesuvianite", "#6b8040"), ("Pumpellyite", "#447744"), ("Lawsonite", "#8090a0"),
    ("Andalusite", "#b08080"), ("Sillimanite", "#c8c0b8"), ("Kyanite", "#5b8fbe"),
    ("Staurolite", "#7b5b3b"), ("Cordierite", "#6666aa"), ("Sodalite", "#3c578e"),
    ("Lazurite", "#26619c"), ("Hauyne", "#4466bb"), ("Leucite", "#d8d0c8"),
    ("Nepheline", "#c8c0a8"), ("Scapolite", "#d0c8b0"), ("Danburite", "#e8e0d0"),
    ("Datolite", "#d4e8d0"), ("Titanite", "#b8a048"), ("Dumortierite", "#4060a0"),
    ("Hemimorphite", "#98d8e8"), ("Willemite", "#70b020"), ("Phenakite", "#e0e0d8"),
    ("Euclase", "#80b8d8"), ("Bertrandite", "#d8d0c0"), ("Chrysoprase", "#79a868"),
    ("Carnelian", "#b5462a"), ("Jasper", "#ce4a2f"), ("Agate", "#b0a090"),
    ("Onyx", "#353839"), ("Chalcedony", "#c8d0d8"), ("Tiger's eye", "#b8860b"),
    ("Hawk's eye", "#4c6c8c"), ("Bloodstone", "#3b6e3f"), ("Aventurine", "#568b52"),
    ("Amazonite", "#4c8c7a"), ("Larvikite", "#4a5060"), ("Charoite", "#7b4e8a"),
    ("Sugilite", "#8b4789"), ("Larimar", "#88c8de"), ("Pietersite", "#4a5c40"),
    ("Moldavite", "#6b8e23"), ("Tektite", "#3a3a3a"), ("Shungite", "#2c2c2c"),
    ("Seraphinite", "#4a7c5a"), ("Astrophyllite", "#8b6c2a"), ("Nuummite", "#3a3a3a"),
    ("Howlite", "#e8e0d8"), ("Magnesite", "#ede6d6"), ("Sodalite", "#3c578e"),
    ("Unakite", "#7a8a5a"), ("Variscite", "#50b848"), ("Wavellite", "#78b868"),
    ("Vivianite", "#2e5e8e"), ("Erythrite", "#d84888"), ("Annabergite", "#58b858"),
    ("Adamite", "#b8e830"), ("Legrandite", "#e8d830"), ("Aurichalcite", "#78c8b8"),
    ("Rosasite", "#58a8a8"), ("Hemimorphite", "#98d8e8"), ("Cavansite", "#3070c8"),
    ("Pentlandite", "#b8a830"), ("Millerite", "#b8a040"), ("Nickeline", "#c8a088"),
    ("Skutterudite", "#808080"), ("Cobaltite", "#808888"), ("Arsenopyrite", "#808888"),
    ("Marcasite", "#c0b838"), ("Pyrrhotite", "#a09048"), ("Pentlandite", "#b8a830"),
    ("Chromite", "#404040"), ("Spessartine", "#e86838"), ("Almandine", "#a03050"),
    ("Pyrope", "#c82040"), ("Grossular", "#80b840"), ("Andradite", "#686830"),
    ("Uvarovite", "#388838"), ("Tsavorite", "#38a848"), ("Demantoid", "#58a838"),
    ("Melanite", "#303030"), ("Topazolite", "#d8c838"), ("Schorl", "#2c2c2c"),
    ("Elbaite", "#48b888"), ("Dravite", "#8b6c3a"), ("Indicolite", "#287888"),
    ("Rubellite", "#c83868"), ("Paraiba tourmaline", "#00b8c8"),
    ("Watermelon tourmaline", "#78b858"), ("Tephroite", "#7a6a50"),
    ("Rhodolite", "#c84878"), ("Iolite", "#5858a8"), ("Scolecite", "#e8e0e0"),
    ("Natrolite", "#e0e0d0"), ("Stilbite", "#e8b898"), ("Heulandite", "#e0c090"),
    ("Apophyllite", "#c8e8d0"), ("Analcime", "#e0e0d0"), ("Chabazite", "#e0c898"),
    ("Phillipsite", "#d0c8b0"), ("Thomsonite", "#d8d0c0"), ("Mesolite", "#e8e0d8"),
    ("Laumontite", "#e0d0b0"), ("Mordenite", "#e0d8c8"), ("Clinoptilolite", "#d8d0c0"),
    ("Erionite", "#e0d8d0"), ("Colemanite", "#e0d8c8"), ("Ulexite", "#f0e8e0"),
    ("Borax", "#e8e0d8"), ("Kernite", "#e0d8d0"), ("Tincalconite", "#e8e0d8"),
    ("Sassolite", "#e0e0d0"), ("Boracite", "#c8d8c0"), ("Sinhalite", "#c0a870"),
    ("Kornerupine", "#587848"), ("Grandidierite", "#4898a8"), ("Serendibite", "#384838"),
    ("Taaffeite", "#c888c8"), ("Painite", "#a06040"), ("Musgravite", "#808878"),
    ("Jeremejevite", "#a8c8e0"), ("Poudretteite", "#e8c0d8"), ("Benitoite", "#3858c8"),
    ("Neptunite", "#383028"), ("Joaquinite", "#a88030"), ("Sanbornite", "#e0d8c8"),
    ("Fresnoite", "#e0d030"), ("Celsian", "#d8d0c0"), ("Hyalophane", "#d0c8b0"),
    ("Harmotome", "#d8d0c0"), ("Pectolite", "#d0e0e0"), ("Okenite", "#f0e8e0"),
    ("Gyrolite", "#e0e8d0"), ("Tobermorite", "#d8d0c8"), ("Xonotlite", "#e0d8d0"),
    ("Thaumasite", "#e8e0d8"), ("Ettringite", "#e8e838"), ("Sturmanite", "#e8e030"),
    ("Charlesite", "#e0e0c8"), ("Afwillite", "#e0d8d0"), ("Hillebrandite", "#e0d8c8"),
    ("Foshagite", "#e0e0d0"), ("Jennite", "#d8c8b8"), ("Suolunite", "#d8d0c0"),
    ("Rosenbuschite", "#c8a070"), ("Eudialyte", "#c84860"), ("Catapleiite", "#c8c0b0"),
    ("Lorenzenite", "#584838"), ("Ramsayite", "#685838"), ("Lamprophyllite", "#a08030"),
    ("Murmanite", "#907050"), ("Lomonosovite", "#886040"), ("Vuonnemite", "#b89050"),
    ("Villiaumite", "#e8a030"), ("Ussingite", "#c8a0b8"), ("Chkalovite", "#d8d0c8"),
    ("Tugtupite", "#e0586e"), ("Sorensenite", "#d8d0c8"), ("Tinguaite", "#586850"),
    ("Cancrinite", "#e0c030"), ("Vishnevite", "#c8c0a0"), ("Davyne", "#d0c890"),
    ("Microsommite", "#d0c890"), ("Nosean", "#707888"), ("Hackmanite", "#9870a0"),
    ("Tugtupite", "#e0586e"), ("Pargasite", "#386838"), ("Edenite", "#507848"),
    ("Kaersutite", "#483830"), ("Richterite", "#586880"), ("Winchite", "#607060"),
    ("Barroisite", "#506860"), ("Gedrite", "#606058"), ("Anthophyllite", "#807860"),
    ("Cummingtonite", "#787068"), ("Grunerite", "#686058"), ("Holmquistite", "#5868a0"),
    ("Sapphirine", "#4060a8"), ("Kornerupine", "#587848"), ("Prismatine", "#586048"),
    ("Boralsilite", "#d0c8b8"), ("Werdingite", "#a8a098"), ("Grandidierite", "#4898a8"),
    ("Ominelite", "#404038"), ("Serendibite", "#384838"), ("Sinhalite", "#c0a870"),
    ("Taafeite", "#c888c8"), ("Musgravite", "#808878"), ("Johachidolite", "#e0c070"),
    ("Painite", "#a06040"), ("Jeremejevite", "#a8c8e0"), ("Poudretteite", "#e8c0d8"),
    ("Benitoite", "#3858c8"), ("Neptunite", "#383028"), ("Joaquinite", "#a88030"),
    ("Sanbornite", "#e0d8c8"), ("Howlite", "#e8e0d8"), ("Magnesite", "#ede6d6"),
    ("Selenite (mineral)", "#f0eee0"), ("Desert rose (crystal)", "#d8b890"),
    ("Fulgurite", "#c8b898"), ("Tektite", "#3a3a3a"), ("Meteorite", "#686058"),
    ("Pallasite", "#a09048"), ("Kamacite", "#909090"), ("Taenite", "#a0a0a0"),
    ("Troilite", "#886838"), ("Schreibersite", "#a0a098"), ("Cohenite", "#686060"),
    ("Moissanite", "#b8e8c8"), ("Lonsdaleite", "#c8c8c0"), ("Stishovite", "#d0d0c8"),
    ("Coesite", "#c8c8c0"), ("Seifertite", "#c0c0b8"), ("Ringwoodite", "#5878c8"),
    ("Bridgmanite", "#a0a098"), ("Davemaoite", "#a8a0a0"), ("Ice", "#e0f0f8"),
    ("Dry ice", "#e8e8f0"), ("Sal ammoniac", "#e0e0d8"), ("Niter", "#e8e0d8"),
    ("Natron", "#e0d8c8"), ("Trona", "#d8d0c0"), ("Thermonatrite", "#e0d8d0"),
    ("Gaylussite", "#d8d0c0"), ("Pirssonite", "#d0c8c0"), ("Shortite", "#e0d838"),
    ("Northupite", "#d8d0c0"), ("Eitelite", "#d0c8c0"), ("Bradleyite", "#c8c0b0"),
    ("Tychite", "#c8c0b0"), ("Schairerite", "#d0c8b8"), ("Sulfohalite", "#d0c8b8"),
    ("Kogarkoite", "#d0c8c0"),
]

# Deduplicate by name (keep first occurrence)
_seen = set()
_deduped = []
for name, color in MINERAL_LIST:
    key = name.lower()
    if key not in _seen:
        _seen.add(key)
        _deduped.append((name, color))
MINERAL_LIST = _deduped

WIKIPEDIA_API_URL = 'https://en.wikipedia.org/w/api.php'

# Pause between minerals, in seconds, to stay under Wikipedia's rate limits.
REQUEST_DELAY_SECONDS = 3

# Trailing "(...)" disambiguation suffix of a page title, e.g. " (mineral)".
_DISAMBIG_SUFFIX_RE = re.compile(r'\s*\([^)]*\)\s*$')

# Substrings of (lower-cased) image file names that mark icons, logos, SVGs
# and other page chrome rather than photographs of the mineral.
_IMAGE_SKIP_MARKERS = (
    'icon', 'logo', 'symbol', 'flag', 'commons-logo', 'wiki',
    'question_mark', 'edit-clear', 'ambox', 'crystal_clear',
    'lock-', 'padlock', 'red_pencil', 'text-', 'globe_',
    'folder_', 'nuvola', 'gnome-', 'information', '.svg',
    'wiktionary', 'disambig', 'merge-', 'split-', 'portal-',
)

# Lazily created requests.Session shared by all API calls (see get_session).
SESSION = None


def get_session():
    """Return the shared requests session, creating it on first use.

    The session carries the bot's User-Agent header on every request.
    """
    global SESSION
    if SESSION is None:
        SESSION = requests.Session()
        SESSION.headers.update({
            'User-Agent': 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
        })
    return SESSION


def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
    """Make a GET request with backoff on 429 errors, respecting Retry-After.

    Args:
        session: requests session used to issue the request.
        url: Target URL.
        params: Query-string parameters.
        timeout: Per-request timeout in seconds.
        max_retries: Maximum number of attempts; values below 1 are treated
            as a single attempt.

    Returns:
        The successful response.

    Raises:
        requests.HTTPError: On any non-2xx status, including a 429 that is
            still returned on the final attempt.
    """
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp
        if attempt == attempts - 1:
            # Out of attempts: fail right away instead of sleeping first.
            break
        retry_after = resp.headers.get('Retry-After') or ''
        if retry_after.isdigit():
            wait = min(int(retry_after) + 1, 120)  # Cap at 2 minutes
        else:
            # Header missing or not a plain number of seconds.
            wait = 10 * (2 ** attempt)  # 10, 20, 40, 80, ...
        time.sleep(wait)
    resp.raise_for_status()
    return resp


def get_wikipedia_page(title):
    """Fetch a parsed Wikipedia page via the API.

    Args:
        title: Page title to parse; redirects are followed.

    Returns:
        The ``parse`` object of the API response (rendered text and image
        list), or None when the API reports an error or the response has no
        ``parse`` member.
    """
    session = get_session()
    resp = _request_with_backoff(session, WIKIPEDIA_API_URL, params={
        'action': 'parse',
        'page': title,
        'prop': 'text|images',
        'format': 'json',
        'redirects': 1,
    })
    data = resp.json()
    if 'error' in data:
        return None
    return data.get('parse')


def _is_decorative_image(img_name):
    """Return True if the file name contains any of the skip markers."""
    lowered = img_name.lower()
    return any(marker in lowered for marker in _IMAGE_SKIP_MARKERS)


def get_image_urls(parse_data, limit=4):
    """Get actual image URLs from the parsed page's image list.

    Args:
        parse_data: ``parse`` object as returned by get_wikipedia_page.
        limit: Maximum number of URLs to return; at most ``limit * 2``
            candidate files are looked up.

    Returns:
        A list of up to ``limit`` URLs (the 800px-wide thumbnail when the API
        provides one, otherwise the original file URL). Lookups that fail are
        skipped, so the list may be shorter or empty.
    """
    session = get_session()
    images = parse_data.get('images', [])

    # Filter out icons, logos, SVGs
    good = [img for img in images if not _is_decorative_image(img)]
    if not good:
        # Nothing survived the filter: fall back to any raster image.
        good = [img for img in images if img.lower().endswith(('.jpg', '.jpeg', '.png'))]

    urls = []
    for img_name in good[:limit * 2]:
        try:
            resp = _request_with_backoff(session, WIKIPEDIA_API_URL, params={
                'action': 'query',
                'titles': f'File:{img_name}',
                'prop': 'imageinfo',
                'iiprop': 'url|size',
                'iiurlwidth': 800,
                'format': 'json',
            }, timeout=15)
            pages = resp.json()['query']['pages']
            for page in pages.values():
                if 'imageinfo' in page:
                    info = page['imageinfo'][0]
                    thumb = info.get('thumburl', info.get('url', ''))
                    if thumb:
                        urls.append(thumb)
        except (requests.RequestException, LookupError, TypeError, ValueError):
            # Best effort: a failed or malformed lookup only costs one image.
            continue
        if len(urls) >= limit:
            return urls
    return urls


def extract_infobox(soup):
    """Extract key-value pairs from a mineral infobox.

    Returns:
        A dict mapping each lower-cased header cell text to its data cell
        text, taken from rows of the first ``table.infobox`` that have both a
        ``th`` and a ``td``. Empty when the page has no infobox. A repeated
        header keeps the value of its last row.
    """
    info = {}
    table = soup.find('table', class_='infobox')
    if not table:
        return info
    for row in table.find_all('tr'):
        th = row.find('th')
        td = row.find('td')
        if th and td:
            key = th.get_text(strip=True).lower()
            val = td.get_text(' ', strip=True)
            info[key] = val
    return info


# Model field -> infobox header fragments to look for, in priority order.
# Fragments are matched as substrings of the lower-cased header text.
FIELD_MAPPINGS = {
    'formula': [
        'formula', 'chemical formula', 'idealformula',
        'formula(repeating unit)', 'chemical', 'composition',
    ],
    'category': ['category', 'mineral class', 'classification', 'group'],
    'crystal_system': ['crystal system', 'crystalsystem', 'crystal class', 'system'],
    'mohs_hardness': ['mohs scalehardness', 'mohs scale hardness', 'hardness', 'mohs hardness'],
    'luster': ['luster', 'lustre', 'luster (mineralogy)'],
    'streak': ['streak', 'streak color'],
    'specific_gravity': ['specific gravity', 'density', 'specificgravity', 'relative density'],
    'color_description': ['color', 'colour', 'color/pleochroism'],
}


def match_field(info, candidates):
    """Find the first matching key from candidates in the info dict.

    Candidates are tried in order; for each one, the first key of ``info``
    (in insertion order) that contains it as a substring wins.

    Returns:
        The matched value, or '' when no candidate matches any key.
    """
    for candidate in candidates:
        for key, val in info.items():
            if candidate in key:
                return val
    return ''


def _clean_text(text):
    """Remove citation marks and normalize whitespace."""
    # Numeric citation markers such as "[1]", "[ 2 ]" or "[3, 4]".
    text = re.sub(r'\[[\d,\s]+\]', '', text)
    text = re.sub(r'\[citation needed\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[clarification needed\]', '', text, flags=re.IGNORECASE)
    # Normalize whitespace (collapse multiple spaces, fix space before punctuation)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
    text = re.sub(r'(\()\s+', r'\1', text)
    return text.strip()


def _find_heading_wrapper(tag):
    """Return the wrapper div if the heading is inside mw-heading, else the tag itself."""
    parent = tag.parent
    if parent and parent.name == 'div' and 'mw-heading' in (parent.get('class') or []):
        return parent
    return tag


def extract_description(soup):
    """Get the first few substantial paragraphs of the article.

    Takes the first three ``<p>`` elements, in document order, whose text is
    longer than 50 characters. Section boundaries are not checked, so a short
    lead can be followed by paragraphs from later sections.

    Returns:
        The cleaned paragraphs joined by blank lines ('' if none qualify).
    """
    paragraphs = []
    for p in soup.find_all('p'):
        text = p.get_text(' ', strip=True)
        if len(text) > 50:
            paragraphs.append(_clean_text(text))
        if len(paragraphs) >= 3:
            break
    return '\n\n'.join(paragraphs)


def _collect_section_paragraphs(start_element, max_paras=2):
    """Collect paragraphs after a heading element until the next heading.

    Only following siblings of ``start_element`` are examined, and only
    ``<p>`` elements with more than 30 characters of text are kept.

    Returns:
        A list of at most ``max_paras`` cleaned paragraph strings.
    """
    parts = []
    heading_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}
    sibling = start_element.find_next_sibling()
    while sibling:
        # Stop at next heading (div.mw-heading or bare h2/h3)
        if sibling.name in ('h2', 'h3'):
            break
        if sibling.name == 'div' and heading_classes & set(sibling.get('class') or []):
            break
        if sibling.name == 'p':
            text = sibling.get_text(' ', strip=True)
            if len(text) > 30:
                parts.append(_clean_text(text))
            if len(parts) >= max_paras:
                break
        sibling = sibling.find_next_sibling()
    return parts


def extract_history(soup):
    """Try to find history/etymology section.

    Returns:
        The paragraphs of the first h2/h3 section whose title contains one of
        the history keywords and that yields any text, joined by blank lines;
        '' when no such section exists.
    """
    history_headers = ['history', 'etymology', 'discovery', 'naming', 'occurrence']
    # Search both bare headings and headings inside mw-heading divs
    for header_tag in soup.find_all(['h2', 'h3']):
        header_text = header_tag.get_text(strip=True).lower()
        header_text = re.sub(r'\[edit\]$', '', header_text).strip()
        if any(h in header_text for h in history_headers):
            wrapper = _find_heading_wrapper(header_tag)
            parts = _collect_section_paragraphs(wrapper)
            if parts:
                return '\n\n'.join(parts)
    return ''


def _display_name(title):
    """Strip Wikipedia disambiguation suffixes, e.g. 'Turquoise (mineral)' -> 'Turquoise'."""
    return _DISAMBIG_SUFFIX_RE.sub('', title).strip()


class Command(BaseCommand):
    """Management command that scrapes MINERAL_LIST into Mineral rows."""

    help = 'Scrape mineral data from Wikipedia'

    def add_arguments(self, parser):
        """Register the --limit, --dry-run and --skip-existing options."""
        parser.add_argument('--limit', type=int, default=0,
                            help='Max minerals to scrape (0 = all)')
        parser.add_argument('--dry-run', action='store_true',
                            help='List minerals without saving')
        parser.add_argument('--skip-existing', action='store_true',
                            help='Skip already-saved minerals')

    def handle(self, *args, **options):
        """Scrape (or, with --dry-run, just list) the selected minerals.

        Writes one progress line per mineral and a final summary of saved,
        skipped and failed counts.
        """
        limit = options['limit']
        dry_run = options['dry_run']
        skip_existing = options['skip_existing']

        # A non-positive limit means "all"; a negative slice bound would
        # otherwise silently drop entries from the end of the list.
        minerals = MINERAL_LIST[:limit] if limit > 0 else MINERAL_LIST
        total = len(minerals)

        self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        skipped = 0
        failed = 0

        for i, (name, color_hex) in enumerate(minerals, 1):
            display_name = _display_name(name)

            if skip_existing and Mineral.objects.filter(name=display_name).exists():
                self.stdout.write(f' [{i}/{total}] SKIP {name} (already exists)')
                skipped += 1
                continue

            if dry_run:
                self.stdout.write(f' [{i}/{total}] {name} ({color_hex})')
                continue

            self.stdout.write(f' [{i}/{total}] Scraping {name}...', ending='')

            try:
                saved = self._scrape_mineral(name, display_name, color_hex, i)
            except Exception as e:
                # Top-level boundary: one bad page must not abort the run.
                self.stdout.write(self.style.ERROR(f' ERROR: {e}'))
                saved = False

            if saved:
                success += 1
            else:
                failed += 1

            # Be polite to Wikipedia — ~3s between minerals keeps us under rate limits.
            # Applied after every attempt, whatever its outcome; nothing follows the last one.
            if i < total:
                time.sleep(REQUEST_DELAY_SECONDS)

        self.stdout.write(f'\nDone: {success} saved, {skipped} skipped, {failed} failed')

    def _scrape_mineral(self, name, display_name, color_hex, position):
        """Fetch one mineral page and create or update its Mineral row.

        Args:
            name: Wikipedia page title to fetch.
            display_name: Title without disambiguation suffix; the lookup key.
            color_hex: Representative color stored with the mineral.
            position: 1-based position in the processed list, stored as
                ``day_of_year``.

        Returns:
            True when a row was saved; False when the page was not found or
            is a disambiguation page (a warning has then been written).
        """
        parsed = get_wikipedia_page(name)
        if not parsed:
            self.stdout.write(self.style.WARNING(' NOT FOUND'))
            return False

        html = parsed['text']['*']
        soup = BeautifulSoup(html, 'html.parser')

        # Skip actual disambiguation pages (they have the dmbox class)
        if soup.find('table', id='disambigbox') or soup.find('div', class_='dmbox'):
            self.stdout.write(self.style.WARNING(' DISAMBIGUATION - SKIPPED'))
            return False

        info = extract_infobox(soup)
        image_urls = get_image_urls(parsed, limit=4)
        description = extract_description(soup)
        history = extract_history(soup)

        # The slices bound each scraped value's length.
        # NOTE(review): presumably these mirror the model's max_length values -- confirm.
        _, created = Mineral.objects.update_or_create(
            name=display_name,
            defaults={
                'formula': match_field(info, FIELD_MAPPINGS['formula'])[:200],
                'category': match_field(info, FIELD_MAPPINGS['category'])[:200],
                'crystal_system': match_field(info, FIELD_MAPPINGS['crystal_system'])[:200],
                'mohs_hardness': match_field(info, FIELD_MAPPINGS['mohs_hardness'])[:50],
                'luster': match_field(info, FIELD_MAPPINGS['luster'])[:200],
                'streak': match_field(info, FIELD_MAPPINGS['streak'])[:200],
                'specific_gravity': match_field(info, FIELD_MAPPINGS['specific_gravity'])[:100],
                'color_description': match_field(info, FIELD_MAPPINGS['color_description'])[:300],
                'color_hex': color_hex,
                'description': description,
                'history': history,
                'image_urls': image_urls,
                'wikipedia_url': f'https://en.wikipedia.org/wiki/{name.replace(" ", "_")}',
                'day_of_year': position,
            },
        )

        status = 'CREATED' if created else 'UPDATED'
        img_count = len(image_urls)
        self.stdout.write(self.style.SUCCESS(f' {status} ({img_count} images)'))
        return True