All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
New dailystone app with 207 minerals scraped from Wikipedia. Each day displays a different mineral with photos, formula, properties, description, and history. Page theme color matches the mineral's typical appearance. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
677 lines
22 KiB
Python
677 lines
22 KiB
Python
"""
|
|
Scrape mineral data from Wikipedia for the Daily Stone feature.
|
|
|
|
Usage:
|
|
python manage.py scrape_minerals # scrape all ~365 minerals
|
|
python manage.py scrape_minerals --limit 10 # scrape first 10 only
|
|
python manage.py scrape_minerals --dry-run # just list names, don't save
|
|
"""
|
|
import re
|
|
import time
|
|
import hashlib
|
|
import json
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from django.core.management.base import BaseCommand
|
|
|
|
from dailystone.models import Mineral
|
|
|
|
# Curated list of well-known, visually interesting minerals with approximate colors.
|
|
# Color hex is a rough representative of the mineral's typical appearance.
|
|
MINERAL_LIST = [
|
|
("Quartz", "#f5f5f5"),
|
|
("Amethyst", "#9b59b6"),
|
|
("Rose quartz", "#f4a7b9"),
|
|
("Citrine (quartz)", "#f0c420"),
|
|
("Diamond", "#e8e8e8"),
|
|
("Ruby", "#e0115f"),
|
|
("Sapphire", "#0f52ba"),
|
|
("Emerald", "#50c878"),
|
|
("Topaz", "#ffc87c"),
|
|
("Opal", "#a8c3bc"),
|
|
("Turquoise (mineral)", "#40e0d0"),
|
|
("Garnet", "#733635"),
|
|
("Peridot", "#b4c424"),
|
|
("Aquamarine", "#7fffd4"),
|
|
("Tanzanite", "#4d5ba8"),
|
|
("Malachite", "#0bda51"),
|
|
("Lapis lazuli", "#26619c"),
|
|
("Jade", "#00a86b"),
|
|
("Obsidian", "#3d3635"),
|
|
("Pyrite", "#c5a647"),
|
|
("Hematite", "#5c5858"),
|
|
("Magnetite", "#353535"),
|
|
("Calcite", "#f5deb3"),
|
|
("Fluorite", "#7b68ee"),
|
|
("Apatite", "#509987"),
|
|
("Beryl", "#c1f0c1"),
|
|
("Spinel", "#ff4040"),
|
|
("Zircon", "#c4b19e"),
|
|
("Alexandrite", "#568c4c"),
|
|
("Tourmaline", "#86c67c"),
|
|
("Moonstone", "#c5cfe0"),
|
|
("Sunstone", "#e07020"),
|
|
("Labradorite", "#5678a0"),
|
|
("Rhodonite", "#e87ea1"),
|
|
("Rhodochrosite", "#e55b6e"),
|
|
("Azurite", "#2d5da1"),
|
|
("Chrysocolla", "#4cb9a0"),
|
|
("Cuprite", "#a52a2a"),
|
|
("Dioptase", "#209d7d"),
|
|
("Wulfenite", "#e68a00"),
|
|
("Vanadinite", "#cc3333"),
|
|
("Crocoite", "#e74c3c"),
|
|
("Realgar", "#e34234"),
|
|
("Orpiment", "#e9a820"),
|
|
("Stibnite", "#708090"),
|
|
("Galena", "#6b6e70"),
|
|
("Cinnabar", "#e44d2e"),
|
|
("Barite", "#c8c8c0"),
|
|
("Celestine (mineral)", "#a8d8ea"),
|
|
("Gypsum", "#f0ece2"),
|
|
("Halite", "#f0f0f0"),
|
|
("Sylvite", "#e0c0a0"),
|
|
("Sulfur", "#edda09"),
|
|
("Copper", "#b87333"),
|
|
("Gold", "#ffd700"),
|
|
("Silver", "#c0c0c0"),
|
|
("Platinum", "#e5e4e2"),
|
|
("Bismuth", "#969696"),
|
|
("Antimony", "#7b8c8a"),
|
|
("Arsenic", "#808080"),
|
|
("Graphite", "#474747"),
|
|
("Corundum", "#d9413c"),
|
|
("Spodumene", "#d8bfd8"),
|
|
("Kunzite", "#e6a8d7"),
|
|
("Hiddenite", "#98fb98"),
|
|
("Chrysoberyl", "#e8d44d"),
|
|
("Nephrite", "#638b57"),
|
|
("Jadeite", "#00a86b"),
|
|
("Serpentine subgroup", "#6b8e23"),
|
|
("Chalcopyrite", "#b8860b"),
|
|
("Bornite", "#8b6914"),
|
|
("Covellite", "#4169e1"),
|
|
("Molybdenite", "#6e6e6e"),
|
|
("Sphalerite", "#a0522d"),
|
|
("Wurtzite", "#8b4513"),
|
|
("Cassiterite", "#5c4033"),
|
|
("Rutile", "#b22222"),
|
|
("Anatase", "#4682b4"),
|
|
("Brookite", "#8b5e3b"),
|
|
("Ilmenite", "#404040"),
|
|
("Goethite", "#7b6b3a"),
|
|
("Limonite", "#9a7b4f"),
|
|
("Siderite", "#8b7d6b"),
|
|
("Magnesite", "#ede6d6"),
|
|
("Dolomite", "#dfc8a8"),
|
|
("Aragonite", "#faebd7"),
|
|
("Smithsonite", "#7ec8c8"),
|
|
("Cerussite", "#c8c8c0"),
|
|
("Witherite", "#e8e0d8"),
|
|
("Strontianite", "#c8d8c0"),
|
|
("Ankerite", "#c8b890"),
|
|
("Olivine", "#9ab973"),
|
|
("Forsterite", "#96be50"),
|
|
("Fayalite", "#6b5c3e"),
|
|
("Augite", "#2e4032"),
|
|
("Diopside", "#507856"),
|
|
("Enstatite", "#908870"),
|
|
("Hypersthene", "#5b5e4e"),
|
|
("Wollastonite", "#e8e0d8"),
|
|
("Tremolite", "#e0e8d8"),
|
|
("Actinolite", "#2d8b57"),
|
|
("Hornblende", "#3b4838"),
|
|
("Glaucophane", "#5b6db8"),
|
|
("Riebeckite", "#2f4f4f"),
|
|
("Muscovite", "#d4c48d"),
|
|
("Biotite", "#4a3c28"),
|
|
("Phlogopite", "#c4a35a"),
|
|
("Lepidolite", "#c8a2c8"),
|
|
("Talc", "#e8e8e0"),
|
|
("Kaolinite", "#f0e8d8"),
|
|
("Montmorillonite", "#c8b090"),
|
|
("Vermiculite", "#b89c78"),
|
|
("Chlorite group", "#6b8f47"),
|
|
("Prehnite", "#c8e8a0"),
|
|
("Epidote", "#7b8b2e"),
|
|
("Zoisite", "#6b8b73"),
|
|
("Clinozoisite", "#7b9b6b"),
|
|
("Vesuvianite", "#6b8040"),
|
|
("Pumpellyite", "#447744"),
|
|
("Lawsonite", "#8090a0"),
|
|
("Andalusite", "#b08080"),
|
|
("Sillimanite", "#c8c0b8"),
|
|
("Kyanite", "#5b8fbe"),
|
|
("Staurolite", "#7b5b3b"),
|
|
("Cordierite", "#6666aa"),
|
|
("Sodalite", "#3c578e"),
|
|
("Lazurite", "#26619c"),
|
|
("Hauyne", "#4466bb"),
|
|
("Leucite", "#d8d0c8"),
|
|
("Nepheline", "#c8c0a8"),
|
|
("Scapolite", "#d0c8b0"),
|
|
("Danburite", "#e8e0d0"),
|
|
("Datolite", "#d4e8d0"),
|
|
("Titanite", "#b8a048"),
|
|
("Dumortierite", "#4060a0"),
|
|
("Hemimorphite", "#98d8e8"),
|
|
("Willemite", "#70b020"),
|
|
("Phenakite", "#e0e0d8"),
|
|
("Euclase", "#80b8d8"),
|
|
("Bertrandite", "#d8d0c0"),
|
|
("Chrysoprase", "#79a868"),
|
|
("Carnelian", "#b5462a"),
|
|
("Jasper", "#ce4a2f"),
|
|
("Agate", "#b0a090"),
|
|
("Onyx", "#353839"),
|
|
("Chalcedony", "#c8d0d8"),
|
|
("Tiger's eye", "#b8860b"),
|
|
("Hawk's eye", "#4c6c8c"),
|
|
("Bloodstone", "#3b6e3f"),
|
|
("Aventurine", "#568b52"),
|
|
("Amazonite", "#4c8c7a"),
|
|
("Larvikite", "#4a5060"),
|
|
("Charoite", "#7b4e8a"),
|
|
("Sugilite", "#8b4789"),
|
|
("Larimar", "#88c8de"),
|
|
("Pietersite", "#4a5c40"),
|
|
("Moldavite", "#6b8e23"),
|
|
("Tektite", "#3a3a3a"),
|
|
("Shungite", "#2c2c2c"),
|
|
("Seraphinite", "#4a7c5a"),
|
|
("Astrophyllite", "#8b6c2a"),
|
|
("Nuummite", "#3a3a3a"),
|
|
("Howlite", "#e8e0d8"),
|
|
("Magnesite", "#ede6d6"),
|
|
("Sodalite", "#3c578e"),
|
|
("Unakite", "#7a8a5a"),
|
|
("Variscite", "#50b848"),
|
|
("Wavellite", "#78b868"),
|
|
("Vivianite", "#2e5e8e"),
|
|
("Erythrite", "#d84888"),
|
|
("Annabergite", "#58b858"),
|
|
("Adamite", "#b8e830"),
|
|
("Legrandite", "#e8d830"),
|
|
("Aurichalcite", "#78c8b8"),
|
|
("Rosasite", "#58a8a8"),
|
|
("Hemimorphite", "#98d8e8"),
|
|
("Cavansite", "#3070c8"),
|
|
("Pentlandite", "#b8a830"),
|
|
("Millerite", "#b8a040"),
|
|
("Nickeline", "#c8a088"),
|
|
("Skutterudite", "#808080"),
|
|
("Cobaltite", "#808888"),
|
|
("Arsenopyrite", "#808888"),
|
|
("Marcasite", "#c0b838"),
|
|
("Pyrrhotite", "#a09048"),
|
|
("Pentlandite", "#b8a830"),
|
|
("Chromite", "#404040"),
|
|
("Spessartine", "#e86838"),
|
|
("Almandine", "#a03050"),
|
|
("Pyrope", "#c82040"),
|
|
("Grossular", "#80b840"),
|
|
("Andradite", "#686830"),
|
|
("Uvarovite", "#388838"),
|
|
("Tsavorite", "#38a848"),
|
|
("Demantoid", "#58a838"),
|
|
("Melanite", "#303030"),
|
|
("Topazolite", "#d8c838"),
|
|
("Schorl", "#2c2c2c"),
|
|
("Elbaite", "#48b888"),
|
|
("Dravite", "#8b6c3a"),
|
|
("Indicolite", "#287888"),
|
|
("Rubellite", "#c83868"),
|
|
("Paraiba tourmaline", "#00b8c8"),
|
|
("Watermelon tourmaline", "#78b858"),
|
|
("Tephroite", "#7a6a50"),
|
|
("Rhodolite", "#c84878"),
|
|
("Iolite", "#5858a8"),
|
|
("Scolecite", "#e8e0e0"),
|
|
("Natrolite", "#e0e0d0"),
|
|
("Stilbite", "#e8b898"),
|
|
("Heulandite", "#e0c090"),
|
|
("Apophyllite", "#c8e8d0"),
|
|
("Analcime", "#e0e0d0"),
|
|
("Chabazite", "#e0c898"),
|
|
("Phillipsite", "#d0c8b0"),
|
|
("Thomsonite", "#d8d0c0"),
|
|
("Mesolite", "#e8e0d8"),
|
|
("Laumontite", "#e0d0b0"),
|
|
("Mordenite", "#e0d8c8"),
|
|
("Clinoptilolite", "#d8d0c0"),
|
|
("Erionite", "#e0d8d0"),
|
|
("Colemanite", "#e0d8c8"),
|
|
("Ulexite", "#f0e8e0"),
|
|
("Borax", "#e8e0d8"),
|
|
("Kernite", "#e0d8d0"),
|
|
("Tincalconite", "#e8e0d8"),
|
|
("Sassolite", "#e0e0d0"),
|
|
("Boracite", "#c8d8c0"),
|
|
("Sinhalite", "#c0a870"),
|
|
("Kornerupine", "#587848"),
|
|
("Grandidierite", "#4898a8"),
|
|
("Serendibite", "#384838"),
|
|
("Taaffeite", "#c888c8"),
|
|
("Painite", "#a06040"),
|
|
("Musgravite", "#808878"),
|
|
("Jeremejevite", "#a8c8e0"),
|
|
("Poudretteite", "#e8c0d8"),
|
|
("Benitoite", "#3858c8"),
|
|
("Neptunite", "#383028"),
|
|
("Joaquinite", "#a88030"),
|
|
("Sanbornite", "#e0d8c8"),
|
|
("Fresnoite", "#e0d030"),
|
|
("Celsian", "#d8d0c0"),
|
|
("Hyalophane", "#d0c8b0"),
|
|
("Harmotome", "#d8d0c0"),
|
|
("Pectolite", "#d0e0e0"),
|
|
("Okenite", "#f0e8e0"),
|
|
("Gyrolite", "#e0e8d0"),
|
|
("Tobermorite", "#d8d0c8"),
|
|
("Xonotlite", "#e0d8d0"),
|
|
("Thaumasite", "#e8e0d8"),
|
|
("Ettringite", "#e8e838"),
|
|
("Sturmanite", "#e8e030"),
|
|
("Charlesite", "#e0e0c8"),
|
|
("Afwillite", "#e0d8d0"),
|
|
("Hillebrandite", "#e0d8c8"),
|
|
("Foshagite", "#e0e0d0"),
|
|
("Jennite", "#d8c8b8"),
|
|
("Suolunite", "#d8d0c0"),
|
|
("Rosenbuschite", "#c8a070"),
|
|
("Eudialyte", "#c84860"),
|
|
("Catapleiite", "#c8c0b0"),
|
|
("Lorenzenite", "#584838"),
|
|
("Ramsayite", "#685838"),
|
|
("Lamprophyllite", "#a08030"),
|
|
("Murmanite", "#907050"),
|
|
("Lomonosovite", "#886040"),
|
|
("Vuonnemite", "#b89050"),
|
|
("Villiaumite", "#e8a030"),
|
|
("Ussingite", "#c8a0b8"),
|
|
("Chkalovite", "#d8d0c8"),
|
|
("Tugtupite", "#e0586e"),
|
|
("Sorensenite", "#d8d0c8"),
|
|
("Tinguaite", "#586850"),
|
|
("Cancrinite", "#e0c030"),
|
|
("Vishnevite", "#c8c0a0"),
|
|
("Davyne", "#d0c890"),
|
|
("Microsommite", "#d0c890"),
|
|
("Nosean", "#707888"),
|
|
("Hackmanite", "#9870a0"),
|
|
("Tugtupite", "#e0586e"),
|
|
("Pargasite", "#386838"),
|
|
("Edenite", "#507848"),
|
|
("Kaersutite", "#483830"),
|
|
("Richterite", "#586880"),
|
|
("Winchite", "#607060"),
|
|
("Barroisite", "#506860"),
|
|
("Gedrite", "#606058"),
|
|
("Anthophyllite", "#807860"),
|
|
("Cummingtonite", "#787068"),
|
|
("Grunerite", "#686058"),
|
|
("Holmquistite", "#5868a0"),
|
|
("Sapphirine", "#4060a8"),
|
|
("Kornerupine", "#587848"),
|
|
("Prismatine", "#586048"),
|
|
("Boralsilite", "#d0c8b8"),
|
|
("Werdingite", "#a8a098"),
|
|
("Grandidierite", "#4898a8"),
|
|
("Ominelite", "#404038"),
|
|
("Serendibite", "#384838"),
|
|
("Sinhalite", "#c0a870"),
|
|
("Taafeite", "#c888c8"),
|
|
("Musgravite", "#808878"),
|
|
("Johachidolite", "#e0c070"),
|
|
("Painite", "#a06040"),
|
|
("Jeremejevite", "#a8c8e0"),
|
|
("Poudretteite", "#e8c0d8"),
|
|
("Benitoite", "#3858c8"),
|
|
("Neptunite", "#383028"),
|
|
("Joaquinite", "#a88030"),
|
|
("Sanbornite", "#e0d8c8"),
|
|
("Howlite", "#e8e0d8"),
|
|
("Magnesite", "#ede6d6"),
|
|
("Selenite (mineral)", "#f0eee0"),
|
|
("Desert rose (crystal)", "#d8b890"),
|
|
("Fulgurite", "#c8b898"),
|
|
("Tektite", "#3a3a3a"),
|
|
("Meteorite", "#686058"),
|
|
("Pallasite", "#a09048"),
|
|
("Kamacite", "#909090"),
|
|
("Taenite", "#a0a0a0"),
|
|
("Troilite", "#886838"),
|
|
("Schreibersite", "#a0a098"),
|
|
("Cohenite", "#686060"),
|
|
("Moissanite", "#b8e8c8"),
|
|
("Lonsdaleite", "#c8c8c0"),
|
|
("Stishovite", "#d0d0c8"),
|
|
("Coesite", "#c8c8c0"),
|
|
("Seifertite", "#c0c0b8"),
|
|
("Ringwoodite", "#5878c8"),
|
|
("Bridgmanite", "#a0a098"),
|
|
("Davemaoite", "#a8a0a0"),
|
|
("Ice", "#e0f0f8"),
|
|
("Dry ice", "#e8e8f0"),
|
|
("Sal ammoniac", "#e0e0d8"),
|
|
("Niter", "#e8e0d8"),
|
|
("Natron", "#e0d8c8"),
|
|
("Trona", "#d8d0c0"),
|
|
("Thermonatrite", "#e0d8d0"),
|
|
("Gaylussite", "#d8d0c0"),
|
|
("Pirssonite", "#d0c8c0"),
|
|
("Shortite", "#e0d838"),
|
|
("Northupite", "#d8d0c0"),
|
|
("Eitelite", "#d0c8c0"),
|
|
("Bradleyite", "#c8c0b0"),
|
|
("Tychite", "#c8c0b0"),
|
|
("Schairerite", "#d0c8b8"),
|
|
("Sulfohalite", "#d0c8b8"),
|
|
("Kogarkoite", "#d0c8c0"),
|
|
]
|
|
|
|
# The curated list above contains repeated entries. Collapse them here,
# comparing names case-insensitively and keeping the first (name, color)
# pair seen for each name so the original ordering is preserved.
_seen = set()
_deduped = []
for name, color in MINERAL_LIST:
    key = name.lower()
    if key in _seen:
        # A mineral with this name was already kept; ignore the repeat.
        continue
    _seen.add(key)
    _deduped.append((name, color))
MINERAL_LIST = _deduped
|
|
|
|
# Lazily created HTTP session shared by every request this module makes.
SESSION = None


def get_session():
    """Return the module-wide ``requests`` session, building it on first use.

    The session carries a descriptive User-Agent header so the bot
    identifies itself on every call.
    """
    global SESSION
    if SESSION is not None:
        return SESSION

    SESSION = requests.Session()
    user_agent = 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
    SESSION.headers.update({'User-Agent': user_agent})
    return SESSION
|
|
|
|
|
|
def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
|
|
"""Make a request with backoff on 429 errors, respecting Retry-After."""
|
|
for attempt in range(max_retries):
|
|
resp = session.get(url, params=params, timeout=timeout)
|
|
if resp.status_code == 429:
|
|
retry_after = resp.headers.get('Retry-After')
|
|
if retry_after and retry_after.isdigit():
|
|
wait = min(int(retry_after) + 1, 120) # Cap at 2 minutes
|
|
else:
|
|
wait = 10 * (2 ** attempt) # 10, 20, 40, 80, 160
|
|
time.sleep(wait)
|
|
continue
|
|
resp.raise_for_status()
|
|
return resp
|
|
resp.raise_for_status()
|
|
return resp
|
|
|
|
|
|
def get_wikipedia_page(title):
    """Ask the English Wikipedia ``parse`` API for the page named ``title``.

    Redirects are followed. Returns the ``parse`` member of the JSON reply,
    or ``None`` when the reply contains an ``error`` member.
    """
    query = {
        'action': 'parse',
        'page': title,
        'prop': 'text|images',
        'format': 'json',
        'redirects': 1,
    }
    session = get_session()
    resp = _request_with_backoff(session, 'https://en.wikipedia.org/w/api.php', params=query)
    payload = resp.json()
    return None if 'error' in payload else payload['parse']
|
|
|
|
|
|
def get_image_urls(parse_data, limit=4):
    """Resolve up to ``limit`` image URLs for the files listed on a parsed page.

    File names from ``parse_data['images']`` are filtered to drop icons,
    logos, SVGs and similar page chrome; if nothing survives, any
    JPEG/PNG file name is used instead. At most ``limit * 2`` candidates are
    looked up through the ``imageinfo`` API, preferring the 800px thumbnail
    URL over the full-size URL. A lookup that fails for any reason is
    skipped rather than aborting the whole mineral.
    """
    session = get_session()
    images = parse_data.get('images', [])

    # Substrings that mark a file as decoration rather than a mineral photo.
    skip_markers = [
        'icon', 'logo', 'symbol', 'flag', 'commons-logo', 'wiki',
        'question_mark', 'edit-clear', 'ambox', 'crystal_clear',
        'lock-', 'padlock', 'red_pencil', 'text-', 'globe_',
        'folder_', 'nuvola', 'gnome-', 'information', '.svg',
        'wiktionary', 'disambig', 'merge-', 'split-', 'portal-',
    ]

    def _looks_decorative(filename):
        lowered = filename.lower()
        return any(marker in lowered for marker in skip_markers)

    good = [img for img in images if not _looks_decorative(img)]
    if not good:
        good = [img for img in images if img.lower().endswith(('.jpg', '.jpeg', '.png'))]

    urls = []
    candidates = good[:limit * 2]
    for img_name in candidates:
        try:
            lookup = {
                'action': 'query',
                'titles': f'File:{img_name}',
                'prop': 'imageinfo',
                'iiprop': 'url|size',
                'iiurlwidth': 800,
                'format': 'json',
            }
            resp = _request_with_backoff(
                session, 'https://en.wikipedia.org/w/api.php', params=lookup, timeout=15,
            )
            pages = resp.json()['query']['pages']
            for page in pages.values():
                if 'imageinfo' not in page:
                    continue
                info = page['imageinfo'][0]
                thumb = info.get('thumburl', info.get('url', ''))
                if not thumb:
                    continue
                urls.append(thumb)
                if len(urls) >= limit:
                    return urls
        except Exception:
            # Best effort: one bad image must not cost us the others.
            continue
    return urls
|
|
|
|
|
|
def extract_infobox(soup):
    """Read the first ``table.infobox`` into a ``{label: value}`` dict.

    Only rows having both a ``th`` and a ``td`` contribute. Labels are
    stripped and lower-cased; values keep a single space between text
    fragments. A later row with the same label overwrites an earlier one.
    Returns an empty dict when the page has no infobox.
    """
    info = {}
    table = soup.find('table', class_='infobox')
    if not table:
        return info

    for row in table.find_all('tr'):
        label_cell = row.find('th')
        value_cell = row.find('td')
        if not (label_cell and value_cell):
            continue
        label = label_cell.get_text(strip=True).lower()
        value = value_cell.get_text(' ', strip=True)
        info[label] = value
    return info
|
|
|
|
|
|
# Maps each stored mineral field to the infobox labels that may hold it.
# Each list is passed to match_field(), which tests candidates in list order
# and accepts a candidate when it is a substring of a lower-cased infobox
# label, so earlier (more specific) entries win over later generic ones.
# Run-together spellings such as 'mohs scalehardness' presumably mirror labels
# whose text fragments get joined without a separator by extract_infobox().
FIELD_MAPPINGS = {
    'formula': [
        'formula', 'chemical formula', 'idealformula',
        'formula(repeating unit)', 'chemical', 'composition',
    ],
    'category': ['category', 'mineral class', 'classification', 'group'],
    'crystal_system': ['crystal system', 'crystalsystem', 'crystal class', 'system'],
    'mohs_hardness': ['mohs scalehardness', 'mohs scale hardness', 'hardness', 'mohs hardness'],
    'luster': ['luster', 'lustre', 'luster (mineralogy)'],
    'streak': ['streak', 'streak color'],
    'specific_gravity': ['specific gravity', 'density', 'specificgravity', 'relative density'],
    'color_description': ['color', 'colour', 'color/pleochroism'],
}
|
|
|
|
|
|
def match_field(info, candidates):
    """Return the value of the first ``info`` entry whose key contains a candidate.

    Candidates are tried in the order given; for each one the dict is
    scanned in insertion order and a plain substring test is applied to the
    key. Returns an empty string when nothing matches.
    """
    hits = (
        value
        for wanted in candidates
        for label, value in info.items()
        if wanted in label
    )
    return next(hits, '')
|
|
|
|
|
|
def _clean_text(text):
|
|
"""Remove citation marks and normalize whitespace."""
|
|
text = re.sub(r'\[[\d,\s]+\]', '', text)
|
|
text = re.sub(r'\[citation needed\]', '', text, flags=re.IGNORECASE)
|
|
text = re.sub(r'\[clarification needed\]', '', text, flags=re.IGNORECASE)
|
|
# Normalize whitespace (collapse multiple spaces, fix space before punctuation)
|
|
text = re.sub(r'\s+', ' ', text)
|
|
text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
|
|
text = re.sub(r'(\()\s+', r'\1', text)
|
|
return text.strip()
|
|
|
|
|
|
def _find_heading_wrapper(tag):
|
|
"""Return the wrapper div if the heading is inside mw-heading, else the tag itself."""
|
|
parent = tag.parent
|
|
if parent and parent.name == 'div' and 'mw-heading' in (parent.get('class') or []):
|
|
return parent
|
|
return tag
|
|
|
|
|
|
def extract_description(soup):
    """Build a short description from the first substantial paragraphs.

    Walks every ``<p>`` in document order, keeps those whose text is longer
    than 50 characters (cleaned with ``_clean_text``) and stops after three.
    Section headings are not consulted, so a page with a short lead may
    contribute paragraphs from later sections. The kept paragraphs are
    joined with blank lines.
    """
    kept = []
    for para in soup.find_all('p'):
        raw = para.get_text(' ', strip=True)
        if len(raw) <= 50:
            # Too short to be real prose (captions, stubs, empty paragraphs).
            continue
        kept.append(_clean_text(raw))
        if len(kept) >= 3:
            break
    return '\n\n'.join(kept)
|
|
|
|
|
|
def _collect_section_paragraphs(start_element, max_paras=2):
    """Gather cleaned paragraph text from the siblings following a heading.

    Scanning stops at the next heading (a bare ``h2``/``h3`` or a ``div``
    with one of the ``mw-heading*`` classes) or once ``max_paras``
    paragraphs longer than 30 characters have been collected.
    """
    heading_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}
    collected = []

    node = start_element.find_next_sibling()
    while node:
        reached_heading = node.name in ['h2', 'h3'] or (
            node.name == 'div' and heading_classes & set(node.get('class') or [])
        )
        if reached_heading:
            break

        if node.name == 'p':
            body = node.get_text(' ', strip=True)
            if len(body) > 30:
                collected.append(_clean_text(body))
                if len(collected) >= max_paras:
                    break

        node = node.find_next_sibling()
    return collected
|
|
|
|
|
|
def extract_history(soup):
    """Return text from the first history-like section that has content.

    Every ``h2``/``h3`` heading is checked in document order; a heading
    qualifies when its lower-cased title (minus a trailing ``[edit]``)
    contains one of the keywords below. The first qualifying section that
    yields paragraphs is returned, joined with blank lines; otherwise an
    empty string is returned.
    """
    keywords = ['history', 'etymology', 'discovery', 'naming', 'occurrence']

    for heading in soup.find_all(['h2', 'h3']):
        title = heading.get_text(strip=True).lower()
        title = re.sub(r'\[edit\]$', '', title).strip()
        if not any(word in title for word in keywords):
            continue

        # Headings may be wrapped in a div.mw-heading; siblings hang off the wrapper.
        anchor = _find_heading_wrapper(heading)
        paragraphs = _collect_section_paragraphs(anchor)
        if paragraphs:
            return '\n\n'.join(paragraphs)
    return ''
|
|
|
|
|
|
class Command(BaseCommand):
    """Management command that scrapes each curated mineral from Wikipedia.

    For every entry of ``MINERAL_LIST`` the command fetches the article,
    extracts infobox fields, description, history and image URLs, and
    stores them on a ``Mineral`` row keyed by display name.
    """

    help = 'Scrape mineral data from Wikipedia'

    def add_arguments(self, parser):
        """Register the ``--limit``, ``--dry-run`` and ``--skip-existing`` options."""
        parser.add_argument('--limit', type=int, default=0, help='Max minerals to scrape (0 = all)')
        parser.add_argument('--dry-run', action='store_true', help='List minerals without saving')
        parser.add_argument('--skip-existing', action='store_true', help='Skip already-saved minerals')

    def handle(self, *args, **options):
        """Process the mineral list and print a per-mineral and final summary."""
        limit = options['limit']
        dry_run = options['dry_run']
        skip_existing = options['skip_existing']

        minerals = MINERAL_LIST
        if limit:
            minerals = minerals[:limit]

        self.stdout.write(f'Processing {len(minerals)} minerals...\n')

        success = 0
        skipped = 0
        failed = 0

        for i, (name, color_hex) in enumerate(minerals, 1):
            # Strip a trailing Wikipedia disambiguation suffix such as
            # "(mineral)"; this is both the lookup key and the stored name.
            display_name = re.sub(r'\s*\([^)]*\)\s*$', '', name).strip()

            if skip_existing and Mineral.objects.filter(name=display_name).exists():
                self.stdout.write(f' [{i}/{len(minerals)}] SKIP {name} (already exists)')
                skipped += 1
                continue

            if dry_run:
                self.stdout.write(f' [{i}/{len(minerals)}] {name} ({color_hex})')
                continue

            self.stdout.write(f' [{i}/{len(minerals)}] Scraping {name}...', ending='')

            try:
                if self._scrape_one(name, display_name, color_hex, i):
                    success += 1
                else:
                    failed += 1
            except Exception as e:
                # Top-level boundary: report the failure and carry on with the next mineral.
                self.stdout.write(self.style.ERROR(f' ERROR: {e}'))
                failed += 1

            # Be polite to Wikipedia — ~3s between minerals keeps us under rate limits.
            # This runs after every attempt that hit the network, including
            # pages that turned out to be missing or disambiguation pages.
            time.sleep(3)

        self.stdout.write(f'\nDone: {success} saved, {skipped} skipped, {failed} failed')

    def _scrape_one(self, name, display_name, color_hex, day_of_year):
        """Fetch, parse and store one mineral.

        Args:
            name: Wikipedia page title to fetch.
            display_name: Name under which the mineral is stored.
            color_hex: Theme colour from the curated list.
            day_of_year: 1-based position of the mineral in the processed list.

        Returns:
            True when a row was created or updated, False when the page was
            missing or is a disambiguation page. The outcome is written to
            stdout in both cases.
        """
        parsed = get_wikipedia_page(name)
        if not parsed:
            self.stdout.write(self.style.WARNING(' NOT FOUND'))
            return False

        html = parsed['text']['*']
        soup = BeautifulSoup(html, 'html.parser')

        # Skip actual disambiguation pages (they have the dmbox class)
        if soup.find('table', id='disambigbox') or soup.find('div', class_='dmbox'):
            self.stdout.write(self.style.WARNING(' DISAMBIGUATION - SKIPPED'))
            return False

        info = extract_infobox(soup)
        image_urls = get_image_urls(parsed, limit=4)
        description = extract_description(soup)
        history = extract_history(soup)

        # Slices keep each value within the length used for that field.
        _, created = Mineral.objects.update_or_create(
            name=display_name,
            defaults={
                'formula': match_field(info, FIELD_MAPPINGS['formula'])[:200],
                'category': match_field(info, FIELD_MAPPINGS['category'])[:200],
                'crystal_system': match_field(info, FIELD_MAPPINGS['crystal_system'])[:200],
                'mohs_hardness': match_field(info, FIELD_MAPPINGS['mohs_hardness'])[:50],
                'luster': match_field(info, FIELD_MAPPINGS['luster'])[:200],
                'streak': match_field(info, FIELD_MAPPINGS['streak'])[:200],
                'specific_gravity': match_field(info, FIELD_MAPPINGS['specific_gravity'])[:100],
                'color_description': match_field(info, FIELD_MAPPINGS['color_description'])[:300],
                'color_hex': color_hex,
                'description': description,
                'history': history,
                'image_urls': image_urls,
                'wikipedia_url': f'https://en.wikipedia.org/wiki/{name.replace(" ", "_")}',
                'day_of_year': day_of_year,
            },
        )

        status = 'CREATED' if created else 'UPDATED'
        img_count = len(image_urls)
        self.stdout.write(self.style.SUCCESS(f' {status} ({img_count} images)'))
        return True
|