Add Russian translations and pronunciation button
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
- Russian fields on Mineral model (name_ru, description_ru, history_ru, etc.)
- scrape_minerals_ru management command fetches from Russian Wikipedia via langlinks
- EN/RU toggle in header, saved to localStorage
- Speaker button next to mineral name uses Web Speech API
- Section headers and labels translated
- Russian Wikipedia link in footer when in RU mode

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
244
dailystone/management/commands/scrape_minerals_ru.py
Normal file
244
dailystone/management/commands/scrape_minerals_ru.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""
|
||||
Scrape Russian Wikipedia translations for existing minerals.
|
||||
|
||||
Usage:
|
||||
python manage.py scrape_minerals_ru
|
||||
python manage.py scrape_minerals_ru --limit 10
|
||||
python manage.py scrape_minerals_ru --skip-existing
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from dailystone.models import Mineral
|
||||
|
||||
# Shared HTTP session; stays None until get_session() is first called.
SESSION = None


def get_session():
    """Return the module-wide ``requests.Session``, creating it on first use.

    The session is stored in the ``SESSION`` global so every later call gets
    the same object, which carries the bot's ``User-Agent`` header.
    """
    global SESSION
    if SESSION is not None:
        return SESSION

    SESSION = requests.Session()
    user_agent = 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
    SESSION.headers.update({'User-Agent': user_agent})
    return SESSION
|
||||
|
||||
|
||||
def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
    """GET ``url`` through ``session``, retrying while the server answers 429.

    Args:
        session: Object exposing ``get(url, params=..., timeout=...)``.
        url: Endpoint to request.
        params: Query parameters forwarded to ``session.get``.
        timeout: Per-request timeout in seconds, forwarded to ``session.get``.
        max_retries: Total number of attempts, including the first one.

    Returns:
        The first response whose status code is not 429 and for which
        ``raise_for_status()`` does not raise.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: Whatever ``raise_for_status()`` raises, either for a
            non-429 error status or for the 429 of the final attempt.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    max_wait = 120  # upper bound, in seconds, for any single pause
    for attempt in range(max_retries):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp

        # No further attempt follows, so waiting would only delay the error.
        if attempt == max_retries - 1:
            break

        retry_after = resp.headers.get('Retry-After')
        if retry_after and retry_after.isdigit():
            # Honour the server's hint, plus one second of slack.
            wait = min(int(retry_after) + 1, max_wait)
        else:
            # Header missing or not a plain number of seconds: back off
            # exponentially (10 s, 20 s, 40 s, ...).
            wait = min(10 * (2 ** attempt), max_wait)
        time.sleep(wait)

    # Every attempt was rate limited; surface the final 429 to the caller.
    resp.raise_for_status()
    return resp
|
||||
|
||||
|
||||
def _clean_text(text):
    """Strip Wikipedia editorial markers from ``text`` and tidy its spacing.

    The substitutions run in order: numeric footnote markers such as ``[1]``
    or ``[2, 3]``, ``[citation needed]`` and ``[уточнить]`` (both matched
    case-insensitively), whitespace runs collapsed to one space, whitespace
    before closing punctuation removed, and whitespace after ``(`` removed.
    The result is returned without leading or trailing whitespace.
    """
    substitutions = (
        (r'\[[\d,\s]+\]', '', 0),
        (r'\[citation needed\]', '', re.IGNORECASE),
        (r'\[уточнить\]', '', re.IGNORECASE),
        (r'\s+', ' ', 0),
        (r'\s+([.,;:!?)])', r'\1', 0),
        (r'(\()\s+', r'\1', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
|
||||
|
||||
|
||||
def get_russian_title(english_title):
    """Look up the Russian Wikipedia title linked from an English article.

    Queries the English Wikipedia API for the ``ru`` language link of
    ``english_title`` (following redirects) and returns the linked title from
    the first returned page that has one, or ``None`` when no page does.
    """
    query = {
        'action': 'query',
        'titles': english_title,
        'prop': 'langlinks',
        'lllang': 'ru',
        'redirects': 1,
        'format': 'json',
    }
    response = _request_with_backoff(
        get_session(), 'https://en.wikipedia.org/w/api.php', params=query,
    )
    pages = response.json().get('query', {}).get('pages', {})

    # Only the first page that actually carries language links matters.
    per_page_links = (page.get('langlinks', []) for page in pages.values())
    first_links = next((links for links in per_page_links if links), None)
    if first_links is None:
        return None
    return first_links[0]['*']
|
||||
|
||||
|
||||
def get_russian_page(title):
    """Fetch the rendered text of a Russian Wikipedia page via ``action=parse``.

    Returns the ``parse`` object of the API response, or ``None`` when the
    response contains an ``error`` key.
    """
    request_params = {
        'action': 'parse',
        'page': title,
        'prop': 'text',
        'format': 'json',
        'redirects': 1,
    }
    response = _request_with_backoff(
        get_session(), 'https://ru.wikipedia.org/w/api.php', params=request_params,
    )
    payload = response.json()
    return None if 'error' in payload else payload['parse']
|
||||
|
||||
|
||||
def _find_heading_wrapper(tag):
    """Return the ``div.mw-heading`` directly enclosing ``tag``, else ``tag``.

    Only the immediate parent is inspected: it must be a ``div`` whose class
    list contains ``mw-heading``.
    """
    container = tag.parent
    if not container or container.name != 'div':
        return tag

    css_classes = container.get('class') or []
    return container if 'mw-heading' in css_classes else tag
|
||||
|
||||
|
||||
def _collect_section_paragraphs(start_element, max_paras=2):
    """Gather cleaned paragraph texts from the siblings after ``start_element``.

    Siblings are walked in order until an ``h2``/``h3`` tag or a ``div``
    carrying one of the ``mw-heading*`` classes is met, or until
    ``max_paras`` paragraphs have been collected. Only ``<p>`` siblings whose
    text is longer than 30 characters are kept, each passed through
    ``_clean_text``.
    """
    heading_tags = ('h2', 'h3')
    heading_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}

    def starts_next_section(node):
        # A bare heading tag, or the wrapper div placed around a heading.
        if node.name in heading_tags:
            return True
        return node.name == 'div' and bool(
            heading_classes & set(node.get('class') or [])
        )

    collected = []
    node = start_element.find_next_sibling()
    while node and not starts_next_section(node):
        if node.name == 'p':
            raw = node.get_text(' ', strip=True)
            if len(raw) > 30:
                collected.append(_clean_text(raw))
        if len(collected) >= max_paras:
            break
        node = node.find_next_sibling()
    return collected
|
||||
|
||||
|
||||
def extract_description(soup):
    """Build a description from the first three substantial paragraphs.

    Every ``<p>`` in ``soup`` is considered in document order; those whose
    text is longer than 50 characters are cleaned with ``_clean_text`` and
    joined with blank lines. Returns an empty string when none qualify.
    """
    wanted = 3
    lead = []
    for paragraph in soup.find_all('p'):
        raw = paragraph.get_text(' ', strip=True)
        if len(raw) <= 50:
            continue
        lead.append(_clean_text(raw))
        if len(lead) >= wanted:
            break
    return '\n\n'.join(lead)
|
||||
|
||||
|
||||
def extract_history(soup):
    """Return text from the first history-like section that has paragraphs.

    ``h2``/``h3`` headings are scanned in document order. A heading matches
    when its lower-cased text, with any ``[править ...]`` edit marker removed,
    contains one of the Russian keywords below. The first matching section
    that yields paragraphs is returned, joined with blank lines; otherwise an
    empty string is returned.
    """
    keywords = (
        'история', 'этимология', 'открытие', 'происхождение названия',
        'название', 'нахождение', 'месторождения',
    )
    for heading in soup.find_all(['h2', 'h3']):
        title = heading.get_text(strip=True).lower()
        title = re.sub(r'\[править[^\]]*\]', '', title).strip()
        if not any(keyword in title for keyword in keywords):
            continue

        # A matching heading with no usable paragraphs does not end the search.
        paragraphs = _collect_section_paragraphs(_find_heading_wrapper(heading))
        if paragraphs:
            return '\n\n'.join(paragraphs)
    return ''
|
||||
|
||||
|
||||
def extract_infobox_color(soup):
    """Try to extract the color description from the Russian infobox.

    Looks at the first ``table`` with class ``infobox`` and returns the cell
    text of the first row whose header contains ``цвет`` or ``окраска``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The cleaned cell text, or an empty string when there is no infobox or
        no matching row.
    """
    table = soup.find('table', class_='infobox')
    if not table:
        return ''

    for row in table.find_all('tr'):
        th = row.find('th')
        td = row.find('td')
        if not (th and td):
            continue
        key = th.get_text(strip=True).lower()
        if 'цвет' in key or 'окраска' in key:
            # Joining text nodes with ' ' leaves a space before commas that
            # follow links, and footnote markers stay in the cell; clean the
            # value the same way description and history texts are cleaned.
            return _clean_text(td.get_text(' ', strip=True))
    return ''
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Fill the Russian (``*_ru``) fields of stored minerals from Wikipedia.

    For each mineral the command resolves the Russian article linked from the
    English one, parses it and saves the Russian name, description, history,
    article URL and, when the infobox has one, the color description.
    """

    help = 'Scrape Russian Wikipedia translations for existing minerals'

    def add_arguments(self, parser):
        """Register the ``--limit`` and ``--skip-existing`` options."""
        parser.add_argument('--limit', type=int, default=0)
        parser.add_argument('--skip-existing', action='store_true',
                            help='Skip minerals that already have Russian name')

    def handle(self, *args, **options):
        """Translate minerals one by one and print a summary at the end.

        ``--limit N`` caps the number of minerals that are actually attempted;
        zero or a negative value means no cap. Combined with
        ``--skip-existing`` the cap is applied after skipping, so repeated
        runs move on to minerals that are still untranslated instead of
        re-selecting the same first N rows.
        """
        limit = options['limit']
        skip_existing = options['skip_existing']

        minerals = Mineral.objects.all()
        # When nothing is skipped, attempted rows equal fetched rows, so the
        # cap can be applied by slicing the queryset. Otherwise attempts are
        # counted inside the loop.
        count_attempts = limit > 0 and skip_existing
        if limit > 0 and not skip_existing:
            minerals = minerals[:limit]

        total = minerals.count()
        self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        skipped = 0
        failed = 0
        attempted = 0

        for i, mineral in enumerate(minerals, 1):
            if skip_existing and mineral.name_ru:
                skipped += 1
                continue
            if count_attempts and attempted >= limit:
                break
            attempted += 1

            self.stdout.write(f'[{i}/{total}] {mineral.name}... ', ending='')

            try:
                translated, message = self._translate(mineral)
            except Exception as e:
                # Per-mineral boundary: one failing article must not abort
                # the whole batch.
                translated, message = False, f'ERROR: {e}'
                pause = 3
            else:
                # A lookup that simply found nothing gets a shorter pause.
                pause = 3 if translated else 2

            self.stdout.write(message)
            if translated:
                success += 1
            else:
                failed += 1
            time.sleep(pause)

        self.stdout.write(
            f'\nDone: {success} translated, {skipped} skipped, {failed} failed'
        )

    @staticmethod
    def _english_title(mineral):
        """Return the English article title taken from the URL, else the name."""
        url = mineral.wikipedia_url
        if url and '/wiki/' in url:
            raw = url.split('/wiki/')[-1]
            # Drop any fragment or query string before decoding, so that an
            # encoded '#' or '?' belonging to the title itself is preserved.
            raw = raw.split('#', 1)[0].split('?', 1)[0]
            title = requests.utils.unquote(raw)
            if title:
                return title
        return mineral.name

    def _translate(self, mineral):
        """Fetch and save Russian data for ``mineral``; return ``(ok, message)``."""
        ru_title = get_russian_title(self._english_title(mineral))
        if not ru_title:
            return False, 'no Russian article'

        parse_data = get_russian_page(ru_title)
        if not parse_data:
            return False, f'failed to fetch {ru_title}'

        html = parse_data['text']['*']
        soup = BeautifulSoup(html, 'html.parser')

        # Remove tables and divs classed 'navbox' or 'metadata' before the
        # extractors look at the page.
        for tag in soup.find_all(['table', 'div'], class_=['navbox', 'metadata']):
            tag.decompose()

        mineral.name_ru = ru_title
        mineral.description_ru = extract_description(soup)
        mineral.history_ru = extract_history(soup)
        mineral.wikipedia_url_ru = f'https://ru.wikipedia.org/wiki/{requests.utils.quote(ru_title)}'

        color = extract_infobox_color(soup)
        if color:
            # Truncated to 300 characters; presumably the model field's limit.
            mineral.color_description_ru = color[:300]

        mineral.save()
        return True, ru_title
|
||||
Reference in New Issue
Block a user