Add daily-stone page showing a different mineral each day
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
New dailystone app with 207 minerals scraped from Wikipedia. Each day displays a different mineral with photos, formula, properties, description, and history. Page theme color matches the mineral's typical appearance. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
676
dailystone/management/commands/scrape_minerals.py
Normal file
676
dailystone/management/commands/scrape_minerals.py
Normal file
@@ -0,0 +1,676 @@
|
||||
"""
|
||||
Scrape mineral data from Wikipedia for the Daily Stone feature.
|
||||
|
||||
Usage:
|
||||
python manage.py scrape_minerals # scrape every mineral in MINERAL_LIST
|
||||
python manage.py scrape_minerals --limit 10 # scrape first 10 only
|
||||
python manage.py scrape_minerals --dry-run # just list names, don't save
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from dailystone.models import Mineral
|
||||
|
||||
# Curated list of well-known, visually interesting minerals with approximate colors.
|
||||
# Color hex is a rough representative of the mineral's typical appearance.
|
||||
MINERAL_LIST = [
|
||||
("Quartz", "#f5f5f5"),
|
||||
("Amethyst", "#9b59b6"),
|
||||
("Rose quartz", "#f4a7b9"),
|
||||
("Citrine (quartz)", "#f0c420"),
|
||||
("Diamond", "#e8e8e8"),
|
||||
("Ruby", "#e0115f"),
|
||||
("Sapphire", "#0f52ba"),
|
||||
("Emerald", "#50c878"),
|
||||
("Topaz", "#ffc87c"),
|
||||
("Opal", "#a8c3bc"),
|
||||
("Turquoise (mineral)", "#40e0d0"),
|
||||
("Garnet", "#733635"),
|
||||
("Peridot", "#b4c424"),
|
||||
("Aquamarine", "#7fffd4"),
|
||||
("Tanzanite", "#4d5ba8"),
|
||||
("Malachite", "#0bda51"),
|
||||
("Lapis lazuli", "#26619c"),
|
||||
("Jade", "#00a86b"),
|
||||
("Obsidian", "#3d3635"),
|
||||
("Pyrite", "#c5a647"),
|
||||
("Hematite", "#5c5858"),
|
||||
("Magnetite", "#353535"),
|
||||
("Calcite", "#f5deb3"),
|
||||
("Fluorite", "#7b68ee"),
|
||||
("Apatite", "#509987"),
|
||||
("Beryl", "#c1f0c1"),
|
||||
("Spinel", "#ff4040"),
|
||||
("Zircon", "#c4b19e"),
|
||||
("Alexandrite", "#568c4c"),
|
||||
("Tourmaline", "#86c67c"),
|
||||
("Moonstone", "#c5cfe0"),
|
||||
("Sunstone", "#e07020"),
|
||||
("Labradorite", "#5678a0"),
|
||||
("Rhodonite", "#e87ea1"),
|
||||
("Rhodochrosite", "#e55b6e"),
|
||||
("Azurite", "#2d5da1"),
|
||||
("Chrysocolla", "#4cb9a0"),
|
||||
("Cuprite", "#a52a2a"),
|
||||
("Dioptase", "#209d7d"),
|
||||
("Wulfenite", "#e68a00"),
|
||||
("Vanadinite", "#cc3333"),
|
||||
("Crocoite", "#e74c3c"),
|
||||
("Realgar", "#e34234"),
|
||||
("Orpiment", "#e9a820"),
|
||||
("Stibnite", "#708090"),
|
||||
("Galena", "#6b6e70"),
|
||||
("Cinnabar", "#e44d2e"),
|
||||
("Barite", "#c8c8c0"),
|
||||
("Celestine (mineral)", "#a8d8ea"),
|
||||
("Gypsum", "#f0ece2"),
|
||||
("Halite", "#f0f0f0"),
|
||||
("Sylvite", "#e0c0a0"),
|
||||
("Sulfur", "#edda09"),
|
||||
("Copper", "#b87333"),
|
||||
("Gold", "#ffd700"),
|
||||
("Silver", "#c0c0c0"),
|
||||
("Platinum", "#e5e4e2"),
|
||||
("Bismuth", "#969696"),
|
||||
("Antimony", "#7b8c8a"),
|
||||
("Arsenic", "#808080"),
|
||||
("Graphite", "#474747"),
|
||||
("Corundum", "#d9413c"),
|
||||
("Spodumene", "#d8bfd8"),
|
||||
("Kunzite", "#e6a8d7"),
|
||||
("Hiddenite", "#98fb98"),
|
||||
("Chrysoberyl", "#e8d44d"),
|
||||
("Nephrite", "#638b57"),
|
||||
("Jadeite", "#00a86b"),
|
||||
("Serpentine subgroup", "#6b8e23"),
|
||||
("Chalcopyrite", "#b8860b"),
|
||||
("Bornite", "#8b6914"),
|
||||
("Covellite", "#4169e1"),
|
||||
("Molybdenite", "#6e6e6e"),
|
||||
("Sphalerite", "#a0522d"),
|
||||
("Wurtzite", "#8b4513"),
|
||||
("Cassiterite", "#5c4033"),
|
||||
("Rutile", "#b22222"),
|
||||
("Anatase", "#4682b4"),
|
||||
("Brookite", "#8b5e3b"),
|
||||
("Ilmenite", "#404040"),
|
||||
("Goethite", "#7b6b3a"),
|
||||
("Limonite", "#9a7b4f"),
|
||||
("Siderite", "#8b7d6b"),
|
||||
("Magnesite", "#ede6d6"),
|
||||
("Dolomite", "#dfc8a8"),
|
||||
("Aragonite", "#faebd7"),
|
||||
("Smithsonite", "#7ec8c8"),
|
||||
("Cerussite", "#c8c8c0"),
|
||||
("Witherite", "#e8e0d8"),
|
||||
("Strontianite", "#c8d8c0"),
|
||||
("Ankerite", "#c8b890"),
|
||||
("Olivine", "#9ab973"),
|
||||
("Forsterite", "#96be50"),
|
||||
("Fayalite", "#6b5c3e"),
|
||||
("Augite", "#2e4032"),
|
||||
("Diopside", "#507856"),
|
||||
("Enstatite", "#908870"),
|
||||
("Hypersthene", "#5b5e4e"),
|
||||
("Wollastonite", "#e8e0d8"),
|
||||
("Tremolite", "#e0e8d8"),
|
||||
("Actinolite", "#2d8b57"),
|
||||
("Hornblende", "#3b4838"),
|
||||
("Glaucophane", "#5b6db8"),
|
||||
("Riebeckite", "#2f4f4f"),
|
||||
("Muscovite", "#d4c48d"),
|
||||
("Biotite", "#4a3c28"),
|
||||
("Phlogopite", "#c4a35a"),
|
||||
("Lepidolite", "#c8a2c8"),
|
||||
("Talc", "#e8e8e0"),
|
||||
("Kaolinite", "#f0e8d8"),
|
||||
("Montmorillonite", "#c8b090"),
|
||||
("Vermiculite", "#b89c78"),
|
||||
("Chlorite group", "#6b8f47"),
|
||||
("Prehnite", "#c8e8a0"),
|
||||
("Epidote", "#7b8b2e"),
|
||||
("Zoisite", "#6b8b73"),
|
||||
("Clinozoisite", "#7b9b6b"),
|
||||
("Vesuvianite", "#6b8040"),
|
||||
("Pumpellyite", "#447744"),
|
||||
("Lawsonite", "#8090a0"),
|
||||
("Andalusite", "#b08080"),
|
||||
("Sillimanite", "#c8c0b8"),
|
||||
("Kyanite", "#5b8fbe"),
|
||||
("Staurolite", "#7b5b3b"),
|
||||
("Cordierite", "#6666aa"),
|
||||
("Sodalite", "#3c578e"),
|
||||
("Lazurite", "#26619c"),
|
||||
("Hauyne", "#4466bb"),
|
||||
("Leucite", "#d8d0c8"),
|
||||
("Nepheline", "#c8c0a8"),
|
||||
("Scapolite", "#d0c8b0"),
|
||||
("Danburite", "#e8e0d0"),
|
||||
("Datolite", "#d4e8d0"),
|
||||
("Titanite", "#b8a048"),
|
||||
("Dumortierite", "#4060a0"),
|
||||
("Hemimorphite", "#98d8e8"),
|
||||
("Willemite", "#70b020"),
|
||||
("Phenakite", "#e0e0d8"),
|
||||
("Euclase", "#80b8d8"),
|
||||
("Bertrandite", "#d8d0c0"),
|
||||
("Chrysoprase", "#79a868"),
|
||||
("Carnelian", "#b5462a"),
|
||||
("Jasper", "#ce4a2f"),
|
||||
("Agate", "#b0a090"),
|
||||
("Onyx", "#353839"),
|
||||
("Chalcedony", "#c8d0d8"),
|
||||
("Tiger's eye", "#b8860b"),
|
||||
("Hawk's eye", "#4c6c8c"),
|
||||
("Bloodstone", "#3b6e3f"),
|
||||
("Aventurine", "#568b52"),
|
||||
("Amazonite", "#4c8c7a"),
|
||||
("Larvikite", "#4a5060"),
|
||||
("Charoite", "#7b4e8a"),
|
||||
("Sugilite", "#8b4789"),
|
||||
("Larimar", "#88c8de"),
|
||||
("Pietersite", "#4a5c40"),
|
||||
("Moldavite", "#6b8e23"),
|
||||
("Tektite", "#3a3a3a"),
|
||||
("Shungite", "#2c2c2c"),
|
||||
("Seraphinite", "#4a7c5a"),
|
||||
("Astrophyllite", "#8b6c2a"),
|
||||
("Nuummite", "#3a3a3a"),
|
||||
("Howlite", "#e8e0d8"),
|
||||
("Magnesite", "#ede6d6"),
|
||||
("Sodalite", "#3c578e"),
|
||||
("Unakite", "#7a8a5a"),
|
||||
("Variscite", "#50b848"),
|
||||
("Wavellite", "#78b868"),
|
||||
("Vivianite", "#2e5e8e"),
|
||||
("Erythrite", "#d84888"),
|
||||
("Annabergite", "#58b858"),
|
||||
("Adamite", "#b8e830"),
|
||||
("Legrandite", "#e8d830"),
|
||||
("Aurichalcite", "#78c8b8"),
|
||||
("Rosasite", "#58a8a8"),
|
||||
("Hemimorphite", "#98d8e8"),
|
||||
("Cavansite", "#3070c8"),
|
||||
("Pentlandite", "#b8a830"),
|
||||
("Millerite", "#b8a040"),
|
||||
("Nickeline", "#c8a088"),
|
||||
("Skutterudite", "#808080"),
|
||||
("Cobaltite", "#808888"),
|
||||
("Arsenopyrite", "#808888"),
|
||||
("Marcasite", "#c0b838"),
|
||||
("Pyrrhotite", "#a09048"),
|
||||
("Pentlandite", "#b8a830"),
|
||||
("Chromite", "#404040"),
|
||||
("Spessartine", "#e86838"),
|
||||
("Almandine", "#a03050"),
|
||||
("Pyrope", "#c82040"),
|
||||
("Grossular", "#80b840"),
|
||||
("Andradite", "#686830"),
|
||||
("Uvarovite", "#388838"),
|
||||
("Tsavorite", "#38a848"),
|
||||
("Demantoid", "#58a838"),
|
||||
("Melanite", "#303030"),
|
||||
("Topazolite", "#d8c838"),
|
||||
("Schorl", "#2c2c2c"),
|
||||
("Elbaite", "#48b888"),
|
||||
("Dravite", "#8b6c3a"),
|
||||
("Indicolite", "#287888"),
|
||||
("Rubellite", "#c83868"),
|
||||
("Paraiba tourmaline", "#00b8c8"),
|
||||
("Watermelon tourmaline", "#78b858"),
|
||||
("Tephroite", "#7a6a50"),
|
||||
("Rhodolite", "#c84878"),
|
||||
("Iolite", "#5858a8"),
|
||||
("Scolecite", "#e8e0e0"),
|
||||
("Natrolite", "#e0e0d0"),
|
||||
("Stilbite", "#e8b898"),
|
||||
("Heulandite", "#e0c090"),
|
||||
("Apophyllite", "#c8e8d0"),
|
||||
("Analcime", "#e0e0d0"),
|
||||
("Chabazite", "#e0c898"),
|
||||
("Phillipsite", "#d0c8b0"),
|
||||
("Thomsonite", "#d8d0c0"),
|
||||
("Mesolite", "#e8e0d8"),
|
||||
("Laumontite", "#e0d0b0"),
|
||||
("Mordenite", "#e0d8c8"),
|
||||
("Clinoptilolite", "#d8d0c0"),
|
||||
("Erionite", "#e0d8d0"),
|
||||
("Colemanite", "#e0d8c8"),
|
||||
("Ulexite", "#f0e8e0"),
|
||||
("Borax", "#e8e0d8"),
|
||||
("Kernite", "#e0d8d0"),
|
||||
("Tincalconite", "#e8e0d8"),
|
||||
("Sassolite", "#e0e0d0"),
|
||||
("Boracite", "#c8d8c0"),
|
||||
("Sinhalite", "#c0a870"),
|
||||
("Kornerupine", "#587848"),
|
||||
("Grandidierite", "#4898a8"),
|
||||
("Serendibite", "#384838"),
|
||||
("Taaffeite", "#c888c8"),
|
||||
("Painite", "#a06040"),
|
||||
("Musgravite", "#808878"),
|
||||
("Jeremejevite", "#a8c8e0"),
|
||||
("Poudretteite", "#e8c0d8"),
|
||||
("Benitoite", "#3858c8"),
|
||||
("Neptunite", "#383028"),
|
||||
("Joaquinite", "#a88030"),
|
||||
("Sanbornite", "#e0d8c8"),
|
||||
("Fresnoite", "#e0d030"),
|
||||
("Celsian", "#d8d0c0"),
|
||||
("Hyalophane", "#d0c8b0"),
|
||||
("Harmotome", "#d8d0c0"),
|
||||
("Pectolite", "#d0e0e0"),
|
||||
("Okenite", "#f0e8e0"),
|
||||
("Gyrolite", "#e0e8d0"),
|
||||
("Tobermorite", "#d8d0c8"),
|
||||
("Xonotlite", "#e0d8d0"),
|
||||
("Thaumasite", "#e8e0d8"),
|
||||
("Ettringite", "#e8e838"),
|
||||
("Sturmanite", "#e8e030"),
|
||||
("Charlesite", "#e0e0c8"),
|
||||
("Afwillite", "#e0d8d0"),
|
||||
("Hillebrandite", "#e0d8c8"),
|
||||
("Foshagite", "#e0e0d0"),
|
||||
("Jennite", "#d8c8b8"),
|
||||
("Suolunite", "#d8d0c0"),
|
||||
("Rosenbuschite", "#c8a070"),
|
||||
("Eudialyte", "#c84860"),
|
||||
("Catapleiite", "#c8c0b0"),
|
||||
("Lorenzenite", "#584838"),
|
||||
("Ramsayite", "#685838"),
|
||||
("Lamprophyllite", "#a08030"),
|
||||
("Murmanite", "#907050"),
|
||||
("Lomonosovite", "#886040"),
|
||||
("Vuonnemite", "#b89050"),
|
||||
("Villiaumite", "#e8a030"),
|
||||
("Ussingite", "#c8a0b8"),
|
||||
("Chkalovite", "#d8d0c8"),
|
||||
("Tugtupite", "#e0586e"),
|
||||
("Sorensenite", "#d8d0c8"),
|
||||
("Tinguaite", "#586850"),
|
||||
("Cancrinite", "#e0c030"),
|
||||
("Vishnevite", "#c8c0a0"),
|
||||
("Davyne", "#d0c890"),
|
||||
("Microsommite", "#d0c890"),
|
||||
("Nosean", "#707888"),
|
||||
("Hackmanite", "#9870a0"),
|
||||
("Tugtupite", "#e0586e"),
|
||||
("Pargasite", "#386838"),
|
||||
("Edenite", "#507848"),
|
||||
("Kaersutite", "#483830"),
|
||||
("Richterite", "#586880"),
|
||||
("Winchite", "#607060"),
|
||||
("Barroisite", "#506860"),
|
||||
("Gedrite", "#606058"),
|
||||
("Anthophyllite", "#807860"),
|
||||
("Cummingtonite", "#787068"),
|
||||
("Grunerite", "#686058"),
|
||||
("Holmquistite", "#5868a0"),
|
||||
("Sapphirine", "#4060a8"),
|
||||
("Kornerupine", "#587848"),
|
||||
("Prismatine", "#586048"),
|
||||
("Boralsilite", "#d0c8b8"),
|
||||
("Werdingite", "#a8a098"),
|
||||
("Grandidierite", "#4898a8"),
|
||||
("Ominelite", "#404038"),
|
||||
("Serendibite", "#384838"),
|
||||
("Sinhalite", "#c0a870"),
|
||||
("Taafeite", "#c888c8"),
|
||||
("Musgravite", "#808878"),
|
||||
("Johachidolite", "#e0c070"),
|
||||
("Painite", "#a06040"),
|
||||
("Jeremejevite", "#a8c8e0"),
|
||||
("Poudretteite", "#e8c0d8"),
|
||||
("Benitoite", "#3858c8"),
|
||||
("Neptunite", "#383028"),
|
||||
("Joaquinite", "#a88030"),
|
||||
("Sanbornite", "#e0d8c8"),
|
||||
("Howlite", "#e8e0d8"),
|
||||
("Magnesite", "#ede6d6"),
|
||||
("Selenite (mineral)", "#f0eee0"),
|
||||
("Desert rose (crystal)", "#d8b890"),
|
||||
("Fulgurite", "#c8b898"),
|
||||
("Tektite", "#3a3a3a"),
|
||||
("Meteorite", "#686058"),
|
||||
("Pallasite", "#a09048"),
|
||||
("Kamacite", "#909090"),
|
||||
("Taenite", "#a0a0a0"),
|
||||
("Troilite", "#886838"),
|
||||
("Schreibersite", "#a0a098"),
|
||||
("Cohenite", "#686060"),
|
||||
("Moissanite", "#b8e8c8"),
|
||||
("Lonsdaleite", "#c8c8c0"),
|
||||
("Stishovite", "#d0d0c8"),
|
||||
("Coesite", "#c8c8c0"),
|
||||
("Seifertite", "#c0c0b8"),
|
||||
("Ringwoodite", "#5878c8"),
|
||||
("Bridgmanite", "#a0a098"),
|
||||
("Davemaoite", "#a8a0a0"),
|
||||
("Ice", "#e0f0f8"),
|
||||
("Dry ice", "#e8e8f0"),
|
||||
("Sal ammoniac", "#e0e0d8"),
|
||||
("Niter", "#e8e0d8"),
|
||||
("Natron", "#e0d8c8"),
|
||||
("Trona", "#d8d0c0"),
|
||||
("Thermonatrite", "#e0d8d0"),
|
||||
("Gaylussite", "#d8d0c0"),
|
||||
("Pirssonite", "#d0c8c0"),
|
||||
("Shortite", "#e0d838"),
|
||||
("Northupite", "#d8d0c0"),
|
||||
("Eitelite", "#d0c8c0"),
|
||||
("Bradleyite", "#c8c0b0"),
|
||||
("Tychite", "#c8c0b0"),
|
||||
("Schairerite", "#d0c8b8"),
|
||||
("Sulfohalite", "#d0c8b8"),
|
||||
("Kogarkoite", "#d0c8c0"),
|
||||
]
|
||||
|
||||
# Drop repeated entries from MINERAL_LIST. Names are compared
# case-insensitively and the first occurrence (with its color) is kept.
_seen = set()
_deduped = []
for name, color in MINERAL_LIST:
    key = name.lower()
    if key in _seen:
        continue
    _seen.add(key)
    _deduped.append((name, color))
MINERAL_LIST = _deduped
|
||||
|
||||
# Lazily created requests.Session shared by every HTTP call in this module.
SESSION = None


def get_session():
    """Return the module-wide HTTP session, creating it on first use.

    The session is stored in the module-level ``SESSION`` variable and is
    given an identifying ``User-Agent`` header when it is created.
    """
    global SESSION
    if SESSION is not None:
        return SESSION
    SESSION = requests.Session()
    SESSION.headers.update({
        'User-Agent': 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
    })
    return SESSION
|
||||
|
||||
|
||||
def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
    """GET ``url`` and retry after a pause while the server answers HTTP 429.

    Args:
        session: Object with a ``get(url, params=..., timeout=...)`` method
            (a ``requests.Session`` in this module).
        url: Address to request.
        params: Query parameters passed through to ``session.get``.
        timeout: Per-request timeout in seconds.
        max_retries: Maximum number of requests to send. Values below 1 are
            treated as 1 so that a request is always made.

    Returns:
        The first response whose status is not 429.

    Raises:
        Whatever ``resp.raise_for_status()`` raises, either for a non-429
        error status or for the final 429 once all attempts are used up.
    """
    max_wait = 120  # never sleep longer than two minutes between attempts
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp
        if attempt == attempts - 1:
            # Out of attempts: sleeping here would only delay the error.
            break
        retry_after = resp.headers.get('Retry-After')
        if retry_after and retry_after.isdigit():
            wait = min(int(retry_after) + 1, max_wait)
        else:
            # No usable Retry-After value: back off 10, 20, 40, 80, ... seconds.
            wait = min(10 * (2 ** attempt), max_wait)
        time.sleep(wait)
    resp.raise_for_status()
    return resp
|
||||
|
||||
|
||||
def get_wikipedia_page(title):
    """Fetch the parsed form of a Wikipedia article through the API.

    Sends an ``action=parse`` request (following redirects) asking for the
    rendered text and the image list of ``title``.

    Returns:
        The ``parse`` object of the JSON reply, or ``None`` when the reply
        contains an ``error`` entry.
    """
    query = {
        'action': 'parse',
        'page': title,
        'prop': 'text|images',
        'format': 'json',
        'redirects': 1,
    }
    response = _request_with_backoff(
        get_session(), 'https://en.wikipedia.org/w/api.php', params=query)
    payload = response.json()
    return None if 'error' in payload else payload['parse']
|
||||
|
||||
|
||||
def get_image_urls(parse_data, limit=4):
    """Resolve image file names from a parsed page into image URLs.

    Args:
        parse_data: The ``parse`` object returned by ``get_wikipedia_page``;
            its ``images`` list (file names) is read.
        limit: Maximum number of URLs to return.

    Returns:
        A list of at most ``limit`` URLs. The 800px-wide thumbnail URL is
        used when the API reply has one, otherwise the full image URL.
    """
    session = get_session()
    file_names = parse_data.get('images', [])

    # File-name fragments that mark icons, logos, SVGs and similar artwork.
    unwanted = (
        'icon', 'logo', 'symbol', 'flag', 'commons-logo', 'wiki',
        'question_mark', 'edit-clear', 'ambox', 'crystal_clear',
        'lock-', 'padlock', 'red_pencil', 'text-', 'globe_',
        'folder_', 'nuvola', 'gnome-', 'information', '.svg',
        'wiktionary', 'disambig', 'merge-', 'split-', 'portal-',
    )
    candidates = []
    for file_name in file_names:
        lowered = file_name.lower()
        if not any(fragment in lowered for fragment in unwanted):
            candidates.append(file_name)
    if not candidates:
        # Everything was filtered out: fall back to any raster image.
        candidates = [
            file_name for file_name in file_names
            if file_name.lower().endswith(('.jpg', '.jpeg', '.png'))
        ]

    found = []
    # Look at up to twice as many files as requested, since some lookups
    # yield no URL.
    for file_name in candidates[:limit * 2]:
        try:
            response = _request_with_backoff(session, 'https://en.wikipedia.org/w/api.php', params={
                'action': 'query',
                'titles': f'File:{file_name}',
                'prop': 'imageinfo',
                'iiprop': 'url|size',
                'iiurlwidth': 800,
                'format': 'json',
            }, timeout=15)
            for entry in response.json()['query']['pages'].values():
                if 'imageinfo' not in entry:
                    continue
                details = entry['imageinfo'][0]
                link = details.get('thumburl', details.get('url', ''))
                if not link:
                    continue
                found.append(link)
                if len(found) >= limit:
                    return found
        except Exception:
            # Best effort: a failed lookup only costs this one image.
            continue
    return found
|
||||
|
||||
|
||||
def extract_infobox(soup):
    """Read the article's infobox table into a dict.

    Every table row that has both a header cell and a data cell contributes
    one entry: the lower-cased header text maps to the cell text. A header
    that occurs more than once keeps the value of its last row. Returns an
    empty dict when no ``table.infobox`` is present.
    """
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return {}

    pairs = {}
    for tr in infobox.find_all('tr'):
        header = tr.find('th')
        cell = tr.find('td')
        if not (header and cell):
            continue
        label = header.get_text(strip=True).lower()
        pairs[label] = cell.get_text(' ', strip=True)
    return pairs
|
||||
|
||||
|
||||
# Maps each value stored on a Mineral to the lower-case infobox header
# fragments that may carry it. match_field() tries the fragments in the
# order listed, so put the most specific spelling first.
FIELD_MAPPINGS = {
    'formula': [
        'formula',
        'chemical formula',
        'idealformula',
        'formula(repeating unit)',
        'chemical',
        'composition',
    ],
    'category': [
        'category',
        'mineral class',
        'classification',
        'group',
    ],
    'crystal_system': [
        'crystal system',
        'crystalsystem',
        'crystal class',
        'system',
    ],
    'mohs_hardness': [
        'mohs scalehardness',
        'mohs scale hardness',
        'hardness',
        'mohs hardness',
    ],
    'luster': [
        'luster',
        'lustre',
        'luster (mineralogy)',
    ],
    'streak': [
        'streak',
        'streak color',
    ],
    'specific_gravity': [
        'specific gravity',
        'density',
        'specificgravity',
        'relative density',
    ],
    'color_description': [
        'color',
        'colour',
        'color/pleochroism',
    ],
}
|
||||
|
||||
|
||||
def match_field(info, candidates):
    """Return the infobox value for the first candidate that matches a key.

    Candidates are tried in the order given. For each candidate an exact key
    match wins; only if there is none does a key that merely contains the
    candidate count. This keeps a row such as ``'formula mass'`` from
    shadowing the real ``'formula'`` row just because it was seen first.

    Args:
        info: Dict of lower-cased infobox headers to cell text.
        candidates: Header fragments to look for, most preferred first.

    Returns:
        The matching value, or ``''`` when no candidate matches any key.
    """
    for candidate in candidates:
        if candidate in info:
            return info[candidate]
        for key, val in info.items():
            if candidate in key:
                return val
    return ''
|
||||
|
||||
|
||||
def _clean_text(text):
    """Strip citation markers from ``text`` and tidy its whitespace.

    Removes numeric reference markers (``[1]``, ``[2, 3]``) and the
    ``[citation needed]`` / ``[clarification needed]`` tags, collapses runs
    of whitespace to one space, and removes the space left before closing
    punctuation or after an opening parenthesis.
    """
    # Callers extract text with get_text(' ', ...), which can leave spaces
    # just inside the brackets ("[ citation needed ]"), so tolerate them.
    text = re.sub(
        r'\[\s*(?:[\d,\s]+|citation needed|clarification needed)\s*\]',
        '',
        text,
        flags=re.IGNORECASE,
    )
    # Normalize whitespace (collapse multiple spaces, fix space before punctuation)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
    text = re.sub(r'(\()\s+', r'\1', text)
    return text.strip()
|
||||
|
||||
|
||||
def _find_heading_wrapper(tag):
    """Return the element whose siblings form the heading's section body.

    When the heading sits inside a ``<div class="mw-heading">`` wrapper,
    that div is returned; otherwise the heading tag itself is returned.
    """
    container = tag.parent
    if not container:
        return tag
    if container.name != 'div':
        return tag
    if 'mw-heading' not in (container.get('class') or []):
        return tag
    return container
|
||||
|
||||
|
||||
def extract_description(soup):
    """Build a description from the first substantial paragraphs of the page.

    Scans every ``<p>`` element in document order, ignores paragraphs of 50
    characters or fewer, cleans the remaining ones with ``_clean_text`` and
    stops after three. The paragraphs are joined with blank lines.

    NOTE(review): nothing here stops at the first section heading, so a
    short lead can be padded with paragraphs from later sections.
    """
    collected = []
    for para in soup.find_all('p'):
        raw = para.get_text(' ', strip=True)
        if len(raw) <= 50:
            continue
        collected.append(_clean_text(raw))
        if len(collected) >= 3:
            break
    return '\n\n'.join(collected)
|
||||
|
||||
|
||||
def _collect_section_paragraphs(start_element, max_paras=2):
    """Gather the paragraphs that follow a heading, up to the next heading.

    Walks the siblings after ``start_element``. The walk ends at a bare
    ``h2``/``h3``, at a ``div`` carrying one of the ``mw-heading`` classes,
    or once ``max_paras`` paragraphs have been collected. Paragraphs of 30
    characters or fewer are skipped; the rest are cleaned with
    ``_clean_text``.

    Returns:
        List of cleaned paragraph strings (possibly empty).
    """
    heading_markers = {'mw-heading', 'mw-heading2', 'mw-heading3'}
    collected = []
    node = start_element.find_next_sibling()
    while node:
        tag_name = node.name
        # A bare heading or a heading wrapper div closes the section.
        if tag_name in ['h2', 'h3']:
            break
        if tag_name == 'div' and heading_markers & set(node.get('class') or []):
            break
        if tag_name == 'p':
            raw = node.get_text(' ', strip=True)
            if len(raw) > 30:
                collected.append(_clean_text(raw))
                if len(collected) >= max_paras:
                    break
        node = node.find_next_sibling()
    return collected
|
||||
|
||||
|
||||
def extract_history(soup):
    """Return the opening paragraphs of the first history-like section.

    Every ``h2``/``h3`` heading is checked in document order. A heading
    qualifies when its lower-cased text (minus a trailing ``[edit]``)
    contains one of: history, etymology, discovery, naming, occurrence. The
    first qualifying section that yields paragraphs is returned, joined with
    blank lines. Returns ``''`` when none does.
    """
    keywords = ['history', 'etymology', 'discovery', 'naming', 'occurrence']

    # find_all sees headings whether bare or nested in an mw-heading div.
    for heading in soup.find_all(['h2', 'h3']):
        title = heading.get_text(strip=True).lower()
        title = re.sub(r'\[edit\]$', '', title).strip()
        if not any(word in title for word in keywords):
            continue
        paragraphs = _collect_section_paragraphs(_find_heading_wrapper(heading))
        if paragraphs:
            return '\n\n'.join(paragraphs)
    return ''
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command that fills the ``Mineral`` table from Wikipedia.

    Walks ``MINERAL_LIST`` in order, fetches each article through the
    Wikipedia API and creates or updates one ``Mineral`` row per entry,
    keyed on the entry's display name.
    """

    help = 'Scrape mineral data from Wikipedia'

    def add_arguments(self, parser):
        """Register the ``--limit``, ``--dry-run`` and ``--skip-existing`` options."""
        parser.add_argument('--limit', type=int, default=0, help='Max minerals to scrape (0 = all)')
        parser.add_argument('--dry-run', action='store_true', help='List minerals without saving')
        parser.add_argument('--skip-existing', action='store_true', help='Skip already-saved minerals')

    @staticmethod
    def _display_name(name):
        """Strip a trailing parenthesised suffix such as ``(mineral)`` from a page title."""
        return re.sub(r'\s*\([^)]*\)\s*$', '', name).strip()

    def _scrape_one(self, name, display_name, color_hex, day_of_year):
        """Fetch one article and create or update its ``Mineral`` row.

        Args:
            name: Wikipedia page title to fetch.
            display_name: Name the row is stored under.
            color_hex: Theme color taken from ``MINERAL_LIST``.
            day_of_year: 1-based position of the entry in the processed list.

        Returns:
            ``(saved, message)``: ``saved`` is False when the page is missing
            or is a disambiguation page; ``message`` is the status text to
            print after the progress prefix.
        """
        parsed = get_wikipedia_page(name)
        if not parsed:
            return False, ' NOT FOUND'

        soup = BeautifulSoup(parsed['text']['*'], 'html.parser')

        # Skip actual disambiguation pages (they have the dmbox class)
        if soup.find('table', id='disambigbox') or soup.find('div', class_='dmbox'):
            return False, ' DISAMBIGUATION - SKIPPED'

        info = extract_infobox(soup)
        image_urls = get_image_urls(parsed, limit=4)
        description = extract_description(soup)
        history = extract_history(soup)

        # Slices keep each value within the length used for that field.
        _, created = Mineral.objects.update_or_create(
            name=display_name,
            defaults={
                'formula': match_field(info, FIELD_MAPPINGS['formula'])[:200],
                'category': match_field(info, FIELD_MAPPINGS['category'])[:200],
                'crystal_system': match_field(info, FIELD_MAPPINGS['crystal_system'])[:200],
                'mohs_hardness': match_field(info, FIELD_MAPPINGS['mohs_hardness'])[:50],
                'luster': match_field(info, FIELD_MAPPINGS['luster'])[:200],
                'streak': match_field(info, FIELD_MAPPINGS['streak'])[:200],
                'specific_gravity': match_field(info, FIELD_MAPPINGS['specific_gravity'])[:100],
                'color_description': match_field(info, FIELD_MAPPINGS['color_description'])[:300],
                'color_hex': color_hex,
                'description': description,
                'history': history,
                'image_urls': image_urls,
                'wikipedia_url': f'https://en.wikipedia.org/wiki/{name.replace(" ", "_")}',
                'day_of_year': day_of_year,
            },
        )

        status = 'CREATED' if created else 'UPDATED'
        return True, f' {status} ({len(image_urls)} images)'

    def handle(self, *args, **options):
        """Run the scrape and print one progress line per mineral plus a summary."""
        limit = options['limit']
        dry_run = options['dry_run']
        skip_existing = options['skip_existing']

        minerals = MINERAL_LIST
        if limit:
            minerals = minerals[:limit]
        total = len(minerals)

        self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        skipped = 0
        failed = 0

        for i, (name, color_hex) in enumerate(minerals, 1):
            display_name = self._display_name(name)
            if skip_existing and Mineral.objects.filter(name=display_name).exists():
                self.stdout.write(f' [{i}/{total}] SKIP {name} (already exists)')
                skipped += 1
                continue

            if dry_run:
                self.stdout.write(f' [{i}/{total}] {name} ({color_hex})')
                continue

            self.stdout.write(f' [{i}/{total}] Scraping {name}...', ending='')

            try:
                saved, message = self._scrape_one(name, display_name, color_hex, i)
            except Exception as e:
                # Top-level boundary: report the failure and move on to the
                # next mineral instead of aborting the whole run.
                self.stdout.write(self.style.ERROR(f' ERROR: {e}'))
                failed += 1
            else:
                if saved:
                    self.stdout.write(self.style.SUCCESS(message))
                    success += 1
                else:
                    self.stdout.write(self.style.WARNING(message))
                    failed += 1

            # Be polite to Wikipedia — ~3s between minerals keeps us under rate limits.
            # This runs after every attempt, including not-found and
            # disambiguation results, which also hit the API.
            time.sleep(3)

        self.stdout.write(f'\nDone: {success} saved, {skipped} skipped, {failed} failed')
|
||||
Reference in New Issue
Block a user