Add Russian translations and pronunciation button
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

- Russian fields on Mineral model (name_ru, description_ru, history_ru, etc.)
- scrape_minerals_ru management command fetches from Russian Wikipedia via langlinks
- EN/RU toggle in header, saved to localStorage
- Speaker button next to mineral name uses Web Speech API
- Section headers and labels translated
- Russian Wikipedia link in footer when in RU mode

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-30 22:54:46 +03:00
parent 7220af6a60
commit 8a162afe2a
5 changed files with 488 additions and 44 deletions

View File

@@ -0,0 +1,244 @@
"""
Scrape Russian Wikipedia translations for existing minerals.
Usage:
python manage.py scrape_minerals_ru
python manage.py scrape_minerals_ru --limit 10
python manage.py scrape_minerals_ru --skip-existing
"""
import re
import time
import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from dailystone.models import Mineral
SESSION = None


def get_session():
    """Return the module-wide ``requests.Session``, creating it on first use.

    The session is stored in the module-level ``SESSION`` variable and carries
    a custom ``User-Agent`` header identifying this scraper.
    """
    global SESSION
    if SESSION is not None:
        return SESSION
    SESSION = requests.Session()
    user_agent = 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
    SESSION.headers.update({'User-Agent': user_agent})
    return SESSION
def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
    """GET ``url`` through ``session``, retrying while the server answers HTTP 429.

    Args:
        session: Object exposing ``get(url, params=..., timeout=...)``.
        url: Endpoint to request.
        params: Query parameters forwarded to ``session.get``.
        timeout: Per-request timeout in seconds.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as a single attempt so a response always exists.

    Returns:
        The first response whose status is not 429, once its
        ``raise_for_status()`` call has passed.

    Raises:
        Whatever ``raise_for_status()`` raises for an error status, including
        the final 429 when every attempt was rate-limited.
    """
    max_wait = 120  # upper bound, in seconds, for any single pause
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code != 429:
            break
        if attempt == attempts - 1:
            # No attempt follows, so sleeping would only delay the error.
            break
        retry_after = resp.headers.get('Retry-After')
        if retry_after and retry_after.isdigit():
            # Honour the server's hint, plus one second of slack.
            wait = min(int(retry_after) + 1, max_wait)
        else:
            # Exponential fallback: 10s, 20s, 40s, ... capped like the hint.
            wait = min(10 * (2 ** attempt), max_wait)
        time.sleep(wait)
    resp.raise_for_status()
    return resp
def _clean_text(text):
    """Strip citation markers from scraped text and tidy its whitespace.

    In order: removes numeric footnote markers such as ``[1]`` or ``[2, 3]``,
    removes ``[citation needed]`` and ``[уточнить]`` tags regardless of case,
    collapses whitespace runs into one space, deletes whitespace before
    closing punctuation and after an opening parenthesis, and finally trims
    both ends of the string.
    """
    substitutions = (
        (r'\[[\d,\s]+\]', '', 0),
        (r'\[citation needed\]', '', re.IGNORECASE),
        (r'\[уточнить\]', '', re.IGNORECASE),
        (r'\s+', ' ', 0),
        (r'\s+([.,;:!?)])', r'\1', 0),
        (r'(\()\s+', r'\1', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
def get_russian_title(english_title):
    """Look up the Russian Wikipedia title linked from an English article.

    Queries the English Wikipedia API for the ``ru`` language links of
    ``english_title`` (with ``redirects`` enabled) and returns the title of
    the first link on the first page that has any, or ``None`` if no page
    reports a Russian link.
    """
    query = {
        'action': 'query',
        'titles': english_title,
        'prop': 'langlinks',
        'lllang': 'ru',
        'redirects': 1,
        'format': 'json',
    }
    response = _request_with_backoff(
        get_session(), 'https://en.wikipedia.org/w/api.php', params=query,
    )
    pages = response.json().get('query', {}).get('pages', {})
    for page in pages.values():
        links = page.get('langlinks', [])
        if not links:
            continue
        return links[0]['*']
    return None
def get_russian_page(title):
    """Fetch the parsed Russian Wikipedia page for ``title``.

    Calls the Russian Wikipedia ``parse`` API (text only, with ``redirects``
    enabled). Returns the ``parse`` object of the JSON response, or ``None``
    when the response contains an ``error`` key.
    """
    api_params = {
        'action': 'parse',
        'page': title,
        'prop': 'text',
        'format': 'json',
        'redirects': 1,
    }
    response = _request_with_backoff(
        get_session(), 'https://ru.wikipedia.org/w/api.php', params=api_params,
    )
    payload = response.json()
    return None if 'error' in payload else payload['parse']
def _find_heading_wrapper(tag):
    """Return the ``div.mw-heading`` element wrapping ``tag``, else ``tag`` itself.

    The parent is returned only when it is a ``div`` whose class list
    contains ``mw-heading``; in every other case the tag is returned as is.
    """
    container = tag.parent
    if not container:
        return tag
    if container.name != 'div':
        return tag
    css_classes = container.get('class') or []
    return container if 'mw-heading' in css_classes else tag
def _collect_section_paragraphs(start_element, max_paras=2):
    """Gather cleaned paragraph texts that follow ``start_element``.

    Walks the following siblings until the next section boundary: an ``h2`` /
    ``h3`` tag, or a ``div`` carrying one of the ``mw-heading*`` classes.
    Paragraphs whose text is longer than 30 characters are cleaned with
    ``_clean_text`` and collected; the walk stops once ``max_paras`` have been
    gathered.

    Returns:
        A list of cleaned paragraph strings (possibly empty).
    """
    boundary_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}

    def starts_new_section(node):
        # Plain heading tag, or the div wrapper placed around headings.
        if node.name in ['h2', 'h3']:
            return True
        return node.name == 'div' and boundary_classes & set(node.get('class') or [])

    collected = []
    node = start_element.find_next_sibling()
    while node:
        if starts_new_section(node):
            break
        if node.name == 'p':
            raw = node.get_text(' ', strip=True)
            # Very short paragraphs are skipped.
            if len(raw) > 30:
                collected.append(_clean_text(raw))
                if len(collected) >= max_paras:
                    break
        node = node.find_next_sibling()
    return collected
def extract_description(soup):
    """Build a description from the first substantial paragraphs of the page.

    Takes, in document order, up to three ``<p>`` elements whose text is
    longer than 50 characters, cleans each with ``_clean_text`` and joins
    them with blank lines. Returns an empty string when none qualify.
    """
    lead = []
    for paragraph in soup.find_all('p'):
        raw = paragraph.get_text(' ', strip=True)
        if len(raw) <= 50:
            continue
        lead.append(_clean_text(raw))
        if len(lead) >= 3:
            break
    return '\n\n'.join(lead)
def extract_history(soup):
    """Extract text from the first history-like section that has paragraphs.

    Scans ``h2`` / ``h3`` headings in document order. A heading matches when
    its lower-cased text, with any ``[править ...]`` edit marker removed,
    contains one of the Russian keywords below. For the first matching
    heading that yields paragraphs, those paragraphs are returned joined by
    blank lines; otherwise an empty string is returned.
    """
    keywords = (
        'история', 'этимология', 'открытие', 'происхождение названия',
        'название', 'нахождение', 'месторождения',
    )
    for heading in soup.find_all(['h2', 'h3']):
        title = heading.get_text(strip=True).lower()
        title = re.sub(r'\[править[^\]]*\]', '', title).strip()
        if not any(keyword in title for keyword in keywords):
            continue
        section_start = _find_heading_wrapper(heading)
        paragraphs = _collect_section_paragraphs(section_start)
        if paragraphs:
            return '\n\n'.join(paragraphs)
    return ''
def extract_infobox_color(soup):
    """Return the colour text from the page's infobox table, if present.

    Looks at the first ``table`` with class ``infobox`` and returns the text
    of the first row whose header cell mentions ``цвет`` or ``окраска``
    (compared in lower case). Returns an empty string when there is no
    infobox or no such row.
    """
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return ''
    for row in infobox.find_all('tr'):
        header = row.find('th')
        value = row.find('td')
        if not (header and value):
            continue
        label = header.get_text(strip=True).lower()
        if any(word in label for word in ('цвет', 'окраска')):
            return value.get_text(' ', strip=True)
    return ''
class Command(BaseCommand):
    """Fill the Russian fields of existing minerals from Russian Wikipedia.

    For each mineral the command resolves the Russian article through the
    English Wikipedia language links, parses the Russian page and stores the
    Russian name, description, history, article URL and (when the infobox
    has one) colour description on the model.
    """

    help = 'Scrape Russian Wikipedia translations for existing minerals'

    def add_arguments(self, parser):
        """Register the ``--limit`` and ``--skip-existing`` options."""
        parser.add_argument('--limit', type=int, default=0,
                            help='Maximum number of minerals to process (0 = no limit)')
        parser.add_argument('--skip-existing', action='store_true',
                            help='Skip minerals that already have Russian name')

    def handle(self, *args, **options):
        """Translate minerals one by one and print a summary at the end."""
        limit = options['limit']
        skip_existing = options['skip_existing']

        minerals = Mineral.objects.all()
        # With --skip-existing the limit is enforced in the loop below, on
        # minerals actually attempted. Slicing first would re-skip the same
        # leading rows on every run and never reach untranslated ones.
        if limit and not skip_existing:
            minerals = minerals[:limit]
        total = minerals.count()
        self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        skipped = 0
        failed = 0
        attempted = 0
        for i, mineral in enumerate(minerals, 1):
            if skip_existing and mineral.name_ru:
                skipped += 1
                continue
            if limit and attempted >= limit:
                break
            attempted += 1

            self.stdout.write(f'[{i}/{total}] {mineral.name}... ', ending='')
            pause = 3
            try:
                saved, message = self._translate(mineral)
            except Exception as e:
                # Batch boundary: report the failure and move on to the next
                # mineral instead of aborting the whole run.
                saved, message = False, f'ERROR: {e}'
            else:
                if not saved:
                    # Misses pause for a shorter time than saves and errors.
                    pause = 2
            self.stdout.write(message)
            if saved:
                success += 1
            else:
                failed += 1
            # Pause between minerals to stay polite to the Wikipedia API.
            time.sleep(pause)

        self.stdout.write(
            f'\nDone: {success} translated, {skipped} skipped, {failed} failed'
        )

    @staticmethod
    def _english_title(mineral):
        """Return the English Wikipedia title to look up for ``mineral``.

        Uses the part of ``wikipedia_url`` after ``/wiki/`` (without any
        ``#fragment`` or ``?query`` suffix, percent-decoded). Falls back to
        the mineral's name when the URL is empty, has no ``/wiki/`` segment,
        or yields an empty title.
        """
        url = mineral.wikipedia_url
        if url and '/wiki/' in url:
            slug = url.split('/wiki/')[-1]
            # Strip before decoding so an encoded "%3F"/"%23" inside a real
            # title survives.
            slug = slug.split('#', 1)[0].split('?', 1)[0]
            title = requests.utils.unquote(slug)
            if title:
                return title
        return mineral.name

    def _translate(self, mineral):
        """Fetch and save the Russian data for a single mineral.

        Returns:
            ``(saved, message)``: ``saved`` is True when the mineral was
            updated and saved; ``message`` is the text to print for it (the
            Russian title on success, the reason otherwise).
        """
        ru_title = get_russian_title(self._english_title(mineral))
        if not ru_title:
            return False, 'no Russian article'

        parse_data = get_russian_page(ru_title)
        if not parse_data:
            return False, f'failed to fetch {ru_title}'

        soup = BeautifulSoup(parse_data['text']['*'], 'html.parser')
        # Drop navboxes and metadata blocks before extracting text.
        for tag in soup.find_all(['table', 'div'], class_=['navbox', 'metadata']):
            tag.decompose()

        mineral.name_ru = ru_title
        mineral.description_ru = extract_description(soup)
        mineral.history_ru = extract_history(soup)
        mineral.wikipedia_url_ru = f'https://ru.wikipedia.org/wiki/{requests.utils.quote(ru_title)}'
        color = extract_infobox_color(soup)
        if color:
            # Truncate the colour text to 300 characters before storing it.
            mineral.color_description_ru = color[:300]
        mineral.save()
        return True, ru_title