Add Russian translations and pronunciation button

- Russian fields on Mineral model (name_ru, description_ru, history_ru, etc.)
- scrape_minerals_ru management command fetches from Russian Wikipedia via langlinks
- EN/RU toggle in header, saved to localStorage
- Speaker button next to mineral name uses Web Speech API
- Section headers and labels translated
- Russian Wikipedia link in footer when in RU mode

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-30 22:54:46 +03:00
parent 7220af6a60
commit 8a162afe2a
5 changed files with 488 additions and 44 deletions
--- a/dailystone/management/commands/scrape_minerals_ru.py
+++ b/dailystone/management/commands/scrape_minerals_ru.py
@@ -0,0 +1,244 @@
+"""
+Scrape Russian Wikipedia translations for existing minerals.
+
+Usage:
+    python manage.py scrape_minerals_ru
+    python manage.py scrape_minerals_ru --limit 10
+    python manage.py scrape_minerals_ru --skip-existing
+"""
+import re
+import time
+
+import requests
+from bs4 import BeautifulSoup
+from django.core.management.base import BaseCommand
+
+from dailystone.models import Mineral
+
+# Module-wide requests.Session; stays None until get_session() creates it.
+SESSION = None
+
+
+def get_session():
+    """Return the shared HTTP session, building it on the first call.
+
+    The session is kept in the module-level ``SESSION`` global so every
+    request made by this command reuses it, and it carries a custom
+    ``User-Agent`` header identifying the bot.
+    """
+    global SESSION
+    if SESSION is not None:
+        return SESSION
+    SESSION = requests.Session()
+    user_agent = 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
+    SESSION.headers.update({'User-Agent': user_agent})
+    return SESSION
+
+
+def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
+    """GET ``url`` and retry with a growing delay while the server answers 429.
+
+    Args:
+        session: Object exposing ``get(url, params=..., timeout=...)``.
+        url: Address to request.
+        params: Query parameters forwarded to ``session.get``.
+        timeout: Per-request timeout in seconds.
+        max_retries: Maximum number of requests to send; must be at least 1.
+
+    Returns:
+        The first response whose status is not 429 and that passes
+        ``raise_for_status()``.
+
+    Raises:
+        ValueError: If ``max_retries`` is smaller than 1.
+        Exception: Whatever ``raise_for_status()`` raises for an error
+            status, including a 429 that persists through the last attempt.
+    """
+    if max_retries < 1:
+        # Without this guard the loop never runs and ``resp`` is unbound.
+        raise ValueError('max_retries must be at least 1')
+
+    max_wait = 120  # upper bound, in seconds, for any single pause
+    for attempt in range(max_retries):
+        resp = session.get(url, params=params, timeout=timeout)
+        if resp.status_code != 429:
+            resp.raise_for_status()
+            return resp
+        if attempt == max_retries - 1:
+            # No request follows the last attempt, so waiting is pointless.
+            break
+        retry_after = resp.headers.get('Retry-After')
+        if retry_after and retry_after.isdigit():
+            wait = int(retry_after) + 1
+        else:
+            # Header missing or not a plain number of seconds: back off
+            # exponentially instead.
+            wait = 10 * (2 ** attempt)
+        time.sleep(min(wait, max_wait))
+    resp.raise_for_status()
+    return resp
+
+
+def _clean_text(text):
+    """Strip wiki markers and tidy the whitespace of extracted article text.
+
+    Removes numeric reference markers such as ``[1]`` or ``[2, 3]``, the
+    ``[citation needed]`` and ``[уточнить]`` tags (case-insensitively),
+    collapses whitespace runs to one space, and drops the space before
+    closing punctuation and after an opening parenthesis.
+    """
+    # Applied strictly in this order: (pattern, replacement, flags).
+    substitutions = (
+        (r'\[[\d,\s]+\]', '', 0),
+        (r'\[citation needed\]', '', re.IGNORECASE),
+        (r'\[уточнить\]', '', re.IGNORECASE),
+        (r'\s+', ' ', 0),
+        (r'\s+([.,;:!?)])', r'\1', 0),
+        (r'(\()\s+', r'\1', 0),
+    )
+    cleaned = text
+    for pattern, replacement, flags in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
+    return cleaned.strip()
+
+
+def get_russian_title(english_title):
+    """Look up the Russian Wikipedia title linked from an English article.
+
+    Queries the English Wikipedia API for the ``ru`` langlink of
+    ``english_title`` (following redirects) and returns the linked title,
+    or ``None`` when no returned page carries such a link.
+    """
+    query = {
+        'action': 'query',
+        'titles': english_title,
+        'prop': 'langlinks',
+        'lllang': 'ru',
+        'redirects': 1,
+        'format': 'json',
+    }
+    response = _request_with_backoff(
+        get_session(), 'https://en.wikipedia.org/w/api.php', params=query)
+    pages = response.json().get('query', {}).get('pages', {})
+    link_lists = (page.get('langlinks', []) for page in pages.values())
+    # The first page with a non-empty langlinks list wins.
+    return next((links[0]['*'] for links in link_lists if links), None)
+
+
+def get_russian_page(title):
+    """Fetch the parsed Russian Wikipedia article called ``title``.
+
+    Returns the ``parse`` object of the API response, or ``None`` when the
+    response contains an ``error`` entry.
+    """
+    query = {
+        'action': 'parse',
+        'page': title,
+        'prop': 'text',
+        'format': 'json',
+        'redirects': 1,
+    }
+    response = _request_with_backoff(
+        get_session(), 'https://ru.wikipedia.org/w/api.php', params=query)
+    payload = response.json()
+    return None if 'error' in payload else payload['parse']
+
+
+def _find_heading_wrapper(tag):
+    """Return the ``div.mw-heading`` that wraps ``tag``, or ``tag`` itself.
+
+    When the heading's direct parent is a ``div`` whose classes include
+    ``mw-heading``, that parent is returned; otherwise the heading is
+    returned unchanged.
+    """
+    container = tag.parent
+    if not container or container.name != 'div':
+        return tag
+    css_classes = container.get('class') or []
+    return container if 'mw-heading' in css_classes else tag
+
+
+def _collect_section_paragraphs(start_element, max_paras=2):
+    """Gather the cleaned paragraphs that follow a section heading.
+
+    Walks the siblings after ``start_element`` and stops at the next
+    ``h2``/``h3`` or heading wrapper ``div``, or once ``max_paras``
+    paragraphs have been collected. Only ``p`` elements whose text is longer
+    than 30 characters are kept; each is passed through ``_clean_text``.
+    """
+    heading_tags = ('h2', 'h3')
+    heading_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}
+    collected = []
+    node = start_element.find_next_sibling()
+    while node:
+        if node.name in heading_tags:
+            break
+        if node.name == 'div' and not heading_classes.isdisjoint(
+                node.get('class') or []):
+            # A wrapped heading marks the start of the next section.
+            break
+        if node.name == 'p':
+            raw = node.get_text(' ', strip=True)
+            if len(raw) > 30:
+                collected.append(_clean_text(raw))
+                if len(collected) >= max_paras:
+                    break
+        node = node.find_next_sibling()
+    return collected
+
+
+def extract_description(soup):
+    """Build a description from the first three substantial paragraphs.
+
+    Takes ``p`` elements in document order, keeps those whose text is longer
+    than 50 characters, cleans them with ``_clean_text`` and joins up to
+    three of them with blank lines. Returns ``''`` when none qualify.
+    """
+    lead = []
+    for paragraph in soup.find_all('p'):
+        raw = paragraph.get_text(' ', strip=True)
+        if len(raw) <= 50:
+            continue
+        lead.append(_clean_text(raw))
+        if len(lead) >= 3:
+            break
+    return '\n\n'.join(lead)
+
+
+def extract_history(soup):
+    """Return text from the first matching history-like section, or ``''``.
+
+    Scans ``h2``/``h3`` headings in document order. A heading matches when
+    its lower-cased text, with any ``[править ...]`` edit marker removed,
+    contains one of the keywords below. The first matching heading that
+    yields paragraphs wins; its paragraphs are joined with blank lines.
+    """
+    keywords = (
+        'история', 'этимология', 'открытие', 'происхождение названия',
+        'название', 'нахождение', 'месторождения',
+    )
+    for heading in soup.find_all(['h2', 'h3']):
+        title = heading.get_text(strip=True).lower()
+        title = re.sub(r'\[править[^\]]*\]', '', title).strip()
+        if not any(keyword in title for keyword in keywords):
+            continue
+        paragraphs = _collect_section_paragraphs(_find_heading_wrapper(heading))
+        if paragraphs:
+            return '\n\n'.join(paragraphs)
+    return ''
+
+
+def extract_infobox_color(soup):
+    """Return the colour text from the article's infobox, or ``''``.
+
+    Looks at the first ``table`` with class ``infobox`` and returns the
+    ``td`` text of the first row whose ``th`` label contains ``цвет`` or
+    ``окраска`` (compared in lower case). Rows lacking either cell are
+    skipped.
+    """
+    infobox = soup.find('table', class_='infobox')
+    if not infobox:
+        return ''
+    for row in infobox.find_all('tr'):
+        label_cell, value_cell = row.find('th'), row.find('td')
+        if not (label_cell and value_cell):
+            continue
+        label = label_cell.get_text(strip=True).lower()
+        if any(word in label for word in ('цвет', 'окраска')):
+            return value_cell.get_text(' ', strip=True)
+    return ''
+
+
+class Command(BaseCommand):
+    """Fill in the Russian-language fields of existing ``Mineral`` rows.
+
+    For each selected mineral the command resolves the Russian Wikipedia
+    article through the English article's langlinks, parses it, and stores
+    the Russian name, description, history, article URL and (when found)
+    colour description.
+    """
+
+    help = 'Scrape Russian Wikipedia translations for existing minerals'
+
+    def add_arguments(self, parser):
+        """Register the ``--limit`` and ``--skip-existing`` options."""
+        parser.add_argument('--limit', type=int, default=0,
+                            help='Process at most this many minerals '
+                                 '(0 means no limit)')
+        parser.add_argument('--skip-existing', action='store_true',
+                            help='Skip minerals that already have Russian name')
+
+    def handle(self, *args, **options):
+        """Translate the selected minerals and print a one-line summary.
+
+        ``--skip-existing`` is applied before ``--limit``, so repeated runs
+        with both options keep advancing through the untranslated minerals
+        instead of re-selecting the same already-translated rows.
+
+        Requests are paced: 2 seconds after a mineral with no usable Russian
+        article, 3 seconds after a successful save or an unexpected error.
+        """
+        limit = options['limit']
+        skip_existing = options['skip_existing']
+
+        # All rows are loaded up front so the skip filter can run before the
+        # limit is applied.
+        all_minerals = list(Mineral.objects.all())
+        if skip_existing:
+            minerals = [m for m in all_minerals if not m.name_ru]
+        else:
+            minerals = all_minerals
+        skipped = len(all_minerals) - len(minerals)
+        if limit > 0:
+            minerals = minerals[:limit]
+
+        total = len(minerals)
+        self.stdout.write(f'Processing {total} minerals...\n')
+
+        success = 0
+        failed = 0
+
+        for i, mineral in enumerate(minerals, 1):
+            self.stdout.write(f'[{i}/{total}] {mineral.name}... ', ending='')
+
+            try:
+                # Extract English Wikipedia title from URL or use name
+                if mineral.wikipedia_url:
+                    en_title = mineral.wikipedia_url.split('/wiki/')[-1]
+                    en_title = requests.utils.unquote(en_title)
+                else:
+                    en_title = mineral.name
+
+                # Find Russian article
+                ru_title = get_russian_title(en_title)
+                if not ru_title:
+                    self.stdout.write('no Russian article')
+                    failed += 1
+                    time.sleep(2)
+                    continue
+
+                # Fetch Russian page
+                parse_data = get_russian_page(ru_title)
+                if not parse_data:
+                    self.stdout.write(f'failed to fetch {ru_title}')
+                    failed += 1
+                    time.sleep(2)
+                    continue
+
+                html = parse_data['text']['*']
+                soup = BeautifulSoup(html, 'html.parser')
+
+                # Drop navbox/metadata tables and divs before extracting text.
+                for tag in soup.find_all(['table', 'div'], class_=['navbox', 'metadata']):
+                    tag.decompose()
+
+                mineral.name_ru = ru_title
+                mineral.description_ru = extract_description(soup)
+                mineral.history_ru = extract_history(soup)
+                mineral.wikipedia_url_ru = f'https://ru.wikipedia.org/wiki/{requests.utils.quote(ru_title)}'
+
+                # An existing colour description is kept when the infobox
+                # yields nothing.
+                color = extract_infobox_color(soup)
+                if color:
+                    mineral.color_description_ru = color[:300]
+
+                mineral.save()
+                success += 1
+                self.stdout.write(f'{ru_title}')
+
+            except Exception as e:
+                # Per-mineral boundary: report the failure and carry on with
+                # the next mineral instead of aborting the whole run.
+                self.stdout.write(f'ERROR: {e}')
+                failed += 1
+
+            time.sleep(3)
+
+        self.stdout.write(
+            f'\nDone: {success} translated, {skipped} skipped, {failed} failed'
+        )