"""Scrape Russian Wikipedia translations for existing minerals.

Usage:
    python manage.py scrape_minerals_ru
    python manage.py scrape_minerals_ru --limit 10
    python manage.py scrape_minerals_ru --skip-existing
"""
import re
import time

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand, CommandError

from dailystone.models import Mineral

SESSION = None

# Upper bound, in seconds, for a single back-off sleep after an HTTP 429.
MAX_BACKOFF_SECONDS = 120


def get_session():
    """Return the shared ``requests.Session``, creating it on first use.

    The session carries the bot's ``User-Agent`` header and is cached in the
    module-level ``SESSION`` variable so all requests reuse it.
    """
    global SESSION
    if SESSION is None:
        SESSION = requests.Session()
        SESSION.headers.update({
            'User-Agent': 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
        })
    return SESSION


def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
    """GET *url* and retry while the server answers HTTP 429.

    Args:
        session: object with a ``get(url, params=..., timeout=...)`` method
            (a ``requests.Session`` in this module).
        url: endpoint to query.
        params: query-string parameters.
        timeout: per-request timeout in seconds.
        max_retries: total number of attempts; must be at least 1.

    Returns:
        The first response whose status is not 429 and passes
        ``raise_for_status()``.

    Raises:
        ValueError: if ``max_retries`` is smaller than 1.
        requests.HTTPError: for any error status, including a 429 that is
            still returned on the last attempt.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    for attempt in range(max_retries):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp

        if attempt == max_retries - 1:
            # No further attempt follows, so sleeping would only delay the error.
            break

        retry_after = resp.headers.get('Retry-After')
        if retry_after and retry_after.isdecimal():
            wait = int(retry_after) + 1
        else:
            # Header missing or not a plain number of seconds: back off
            # exponentially instead.
            wait = 10 * (2 ** attempt)
        time.sleep(min(wait, MAX_BACKOFF_SECONDS))

    # Still rate-limited after the last attempt: surface it as an HTTPError.
    resp.raise_for_status()
    return resp


def _clean_text(text):
    """Strip footnote/citation markers and normalise whitespace in *text*."""
    # Numeric footnote markers such as "[1]" or "[2, 3]".
    text = re.sub(r'\[[\d,\s]+\]', '', text)
    text = re.sub(r'\[citation needed\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[уточнить\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text)
    # Remove the space left before closing punctuation and after "(".
    text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
    text = re.sub(r'(\()\s+', r'\1', text)
    return text.strip()


def get_russian_title(english_title):
    """Get the Russian Wikipedia article title via langlinks API.

    Returns the title string, or ``None`` when no page in the response
    carries a Russian language link.
    """
    session = get_session()
    resp = _request_with_backoff(session, 'https://en.wikipedia.org/w/api.php', params={
        'action': 'query',
        'titles': english_title,
        'prop': 'langlinks',
        'lllang': 'ru',
        'redirects': 1,
        'format': 'json',
    })
    data = resp.json()
    pages = data.get('query', {}).get('pages', {})
    for page_data in pages.values():
        langlinks = page_data.get('langlinks', [])
        if langlinks:
            return langlinks[0]['*']
    return None


def get_russian_page(title):
    """Fetch parsed Russian Wikipedia page.

    Returns the ``parse`` object of the API response, or ``None`` when the
    response reports an error or contains no ``parse`` object.
    """
    session = get_session()
    resp = _request_with_backoff(session, 'https://ru.wikipedia.org/w/api.php', params={
        'action': 'parse',
        'page': title,
        'prop': 'text',
        'format': 'json',
        'redirects': 1,
    })
    data = resp.json()
    if 'error' in data:
        return None
    # .get() so an unexpected payload yields None instead of a KeyError.
    return data.get('parse')


def _find_heading_wrapper(tag):
    """Return the ``div.mw-heading`` wrapping *tag*, or *tag* itself if none."""
    parent = tag.parent
    if parent and parent.name == 'div' and 'mw-heading' in (parent.get('class') or []):
        return parent
    return tag


def _collect_section_paragraphs(start_element, max_paras=2):
    """Collect cleaned ``<p>`` texts that follow *start_element*.

    Walks the following siblings until the next ``h2``/``h3`` heading (bare
    or wrapped in a heading ``div``) or until *max_paras* paragraphs longer
    than 30 characters have been gathered.
    """
    parts = []
    heading_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}
    sibling = start_element.find_next_sibling()
    while sibling:
        if sibling.name in ['h2', 'h3']:
            break
        if sibling.name == 'div' and heading_classes & set(sibling.get('class') or []):
            break
        if sibling.name == 'p':
            text = sibling.get_text(' ', strip=True)
            if len(text) > 30:
                parts.append(_clean_text(text))
                if len(parts) >= max_paras:
                    break
        sibling = sibling.find_next_sibling()
    return parts


def extract_description(soup):
    """Return the first three paragraphs longer than 50 characters, cleaned
    and joined by blank lines."""
    paragraphs = []
    for p in soup.find_all('p'):
        text = p.get_text(' ', strip=True)
        if len(text) > 50:
            paragraphs.append(_clean_text(text))
            if len(paragraphs) >= 3:
                break
    return '\n\n'.join(paragraphs)


def extract_history(soup):
    """Return text from the first history-like section that has paragraphs.

    A section qualifies when its ``h2``/``h3`` heading contains one of the
    keywords below; an empty string is returned when none yields text.
    """
    history_headers = [
        'история',
        'этимология',
        'открытие',
        'происхождение названия',
        'название',
        'нахождение',
        'месторождения',
    ]
    for header_tag in soup.find_all(['h2', 'h3']):
        header_text = header_tag.get_text(strip=True).lower()
        # Drop the "[править ...]" edit-link text embedded in the heading.
        header_text = re.sub(r'\[править[^\]]*\]', '', header_text).strip()
        if any(h in header_text for h in history_headers):
            wrapper = _find_heading_wrapper(header_tag)
            parts = _collect_section_paragraphs(wrapper)
            if parts:
                return '\n\n'.join(parts)
    return ''


def extract_infobox_color(soup):
    """Try to extract color description from Russian infobox.

    Returns the cleaned text of the first infobox row whose header mentions
    'цвет' or 'окраска', or an empty string when there is no such row.
    """
    table = soup.find('table', class_='infobox')
    if not table:
        return ''
    for row in table.find_all('tr'):
        th = row.find('th')
        td = row.find('td')
        if th and td:
            key = th.get_text(strip=True).lower()
            if 'цвет' in key or 'окраска' in key:
                # Clean like the other extractors so footnote markers such
                # as "[1]" are not stored with the color.
                return _clean_text(td.get_text(' ', strip=True))
    return ''


def _english_title(mineral):
    """Derive the English Wikipedia title to look up for *mineral*.

    Uses the path segment after '/wiki/' in ``mineral.wikipedia_url`` with
    any '#fragment' or '?query' removed and percent-escapes decoded. Falls
    back to ``mineral.name`` when the URL is empty or yields no title.
    """
    url = mineral.wikipedia_url or ''
    if '/wiki/' in url:
        raw = url.split('/wiki/')[-1]
        # Cut before unquoting so an encoded '%3F' or '%23' stays in the title.
        raw = raw.split('#', 1)[0].split('?', 1)[0]
        title = requests.utils.unquote(raw)
        if title:
            return title
    return mineral.name


class Command(BaseCommand):
    """Management command that fills the Russian fields of ``Mineral`` rows."""

    help = 'Scrape Russian Wikipedia translations for existing minerals'

    def add_arguments(self, parser):
        parser.add_argument('--limit', type=int, default=0,
                            help='Process at most this many minerals (0 = no limit)')
        parser.add_argument('--skip-existing', action='store_true',
                            help='Skip minerals that already have Russian name')

    def handle(self, *args, **options):
        """Scrape every selected mineral and print a per-item and final report.

        With ``--skip-existing`` the already-translated minerals are removed
        before ``--limit`` is applied, so the limit counts only minerals that
        still need work and repeated limited runs make progress.
        """
        limit = options['limit']
        skip_existing = options['skip_existing']
        if limit < 0:
            raise CommandError('--limit must not be negative')

        minerals = Mineral.objects.all()
        skipped = 0
        if skip_existing:
            everything = list(minerals)
            minerals = [m for m in everything if not m.name_ru]
            skipped = len(everything) - len(minerals)
        if limit:
            minerals = minerals[:limit]

        total = len(minerals)
        self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        failed = 0

        for i, mineral in enumerate(minerals, 1):
            self.stdout.write(f'[{i}/{total}] {mineral.name}... ', ending='')

            # Top-level boundary: one bad mineral must not abort the run.
            try:
                en_title = _english_title(mineral)

                # Find Russian article
                ru_title = get_russian_title(en_title)
                if not ru_title:
                    self.stdout.write('no Russian article')
                    failed += 1
                    time.sleep(2)
                    continue

                # Fetch Russian page
                parse_data = get_russian_page(ru_title)
                if not parse_data:
                    self.stdout.write(f'failed to fetch {ru_title}')
                    failed += 1
                    time.sleep(2)
                    continue

                html = parse_data['text']['*']
                soup = BeautifulSoup(html, 'html.parser')

                # Remove navboxes and metadata blocks before extracting text.
                for tag in soup.find_all(['table', 'div'], class_=['navbox', 'metadata']):
                    tag.decompose()

                mineral.name_ru = ru_title
                mineral.description_ru = extract_description(soup)
                mineral.history_ru = extract_history(soup)
                mineral.wikipedia_url_ru = (
                    f'https://ru.wikipedia.org/wiki/{requests.utils.quote(ru_title)}'
                )

                color = extract_infobox_color(soup)
                if color:
                    mineral.color_description_ru = color[:300]

                mineral.save()
                success += 1
                self.stdout.write(f'{ru_title}')

            except Exception as e:
                self.stdout.write(f'ERROR: {e}')
                failed += 1

            time.sleep(3)

        self.stdout.write(
            f'\nDone: {success} translated, {skipped} skipped, {failed} failed'
        )