All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
- Russian fields on Mineral model (name_ru, description_ru, history_ru, etc.) - scrape_minerals_ru management command fetches from Russian Wikipedia via langlinks - EN/RU toggle in header, saved to localStorage - Speaker button next to mineral name uses Web Speech API - Section headers and labels translated - Russian Wikipedia link in footer when in RU mode Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
245 lines
7.8 KiB
Python
245 lines
7.8 KiB
Python
"""
|
|
Scrape Russian Wikipedia translations for existing minerals.

Usage:
    python manage.py scrape_minerals_ru
    python manage.py scrape_minerals_ru --limit 10
    python manage.py scrape_minerals_ru --skip-existing
|
|
"""
|
|
import re
|
|
import time
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from django.core.management.base import BaseCommand
|
|
|
|
from dailystone.models import Mineral
|
|
|
|
# Module-wide HTTP session cache: stays None until get_session() creates the
# requests.Session on first use and stores it here for later calls.
SESSION = None
|
|
|
|
|
|
def get_session():
    """Return the shared ``requests.Session``, creating it on first call.

    The session is kept in the module-level ``SESSION`` global so every
    request made by this module reuses it. On creation the session's
    ``User-Agent`` header is set to a string identifying this scraper.
    """
    global SESSION
    if SESSION is not None:
        return SESSION

    # Publish the session first, then configure it (same order as before).
    SESSION = new_session = requests.Session()
    new_session.headers.update({
        'User-Agent': 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
    })
    return SESSION
|
|
|
|
|
|
def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
|
|
for attempt in range(max_retries):
|
|
resp = session.get(url, params=params, timeout=timeout)
|
|
if resp.status_code == 429:
|
|
retry_after = resp.headers.get('Retry-After')
|
|
if retry_after and retry_after.isdigit():
|
|
wait = min(int(retry_after) + 1, 120)
|
|
else:
|
|
wait = 10 * (2 ** attempt)
|
|
time.sleep(wait)
|
|
continue
|
|
resp.raise_for_status()
|
|
return resp
|
|
resp.raise_for_status()
|
|
return resp
|
|
|
|
|
|
def _clean_text(text):
|
|
text = re.sub(r'\[[\d,\s]+\]', '', text)
|
|
text = re.sub(r'\[citation needed\]', '', text, flags=re.IGNORECASE)
|
|
text = re.sub(r'\[уточнить\]', '', text, flags=re.IGNORECASE)
|
|
text = re.sub(r'\s+', ' ', text)
|
|
text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
|
|
text = re.sub(r'(\()\s+', r'\1', text)
|
|
return text.strip()
|
|
|
|
|
|
def get_russian_title(english_title):
    """Look up the Russian Wikipedia title linked from an English article.

    Sends a ``langlinks`` query (restricted to ``ru``, following redirects)
    to the English Wikipedia API and returns the title from the first page
    in the response that has a language link, or ``None`` if none has one.
    """
    query = {
        'action': 'query',
        'titles': english_title,
        'prop': 'langlinks',
        'lllang': 'ru',
        'redirects': 1,
        'format': 'json',
    }
    response = _request_with_backoff(
        get_session(), 'https://en.wikipedia.org/w/api.php', params=query,
    )
    pages = response.json().get('query', {}).get('pages', {})

    for page in pages.values():
        links = page.get('langlinks', [])
        if not links:
            continue
        # The link title is stored under the '*' key of each entry.
        return links[0]['*']
    return None
|
|
|
|
|
|
def get_russian_page(title):
    """Fetch the parsed content of a Russian Wikipedia page.

    Calls the Russian Wikipedia API with ``action=parse`` (text only,
    following redirects). Returns the ``parse`` object of the JSON reply,
    or ``None`` when the reply contains an ``error`` key.
    """
    request_params = {
        'action': 'parse',
        'page': title,
        'prop': 'text',
        'format': 'json',
        'redirects': 1,
    }
    response = _request_with_backoff(
        get_session(), 'https://ru.wikipedia.org/w/api.php', params=request_params,
    )
    payload = response.json()
    return None if 'error' in payload else payload['parse']
|
|
|
|
|
|
def _find_heading_wrapper(tag):
|
|
parent = tag.parent
|
|
if parent and parent.name == 'div' and 'mw-heading' in (parent.get('class') or []):
|
|
return parent
|
|
return tag
|
|
|
|
|
|
def _collect_section_paragraphs(start_element, max_paras=2):
|
|
parts = []
|
|
heading_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}
|
|
sibling = start_element.find_next_sibling()
|
|
while sibling:
|
|
if sibling.name in ['h2', 'h3']:
|
|
break
|
|
if sibling.name == 'div' and heading_classes & set(sibling.get('class') or []):
|
|
break
|
|
if sibling.name == 'p':
|
|
text = sibling.get_text(' ', strip=True)
|
|
if len(text) > 30:
|
|
parts.append(_clean_text(text))
|
|
if len(parts) >= max_paras:
|
|
break
|
|
sibling = sibling.find_next_sibling()
|
|
return parts
|
|
|
|
|
|
def extract_description(soup):
    """Build a description from the first substantial paragraphs of a page.

    Scans every ``p`` element in document order, keeps those whose text is
    longer than 50 characters (cleaned with ``_clean_text``), and stops
    after three. The kept paragraphs are joined with blank lines; an empty
    string is returned when nothing qualifies.
    """
    selected = []
    for paragraph in soup.find_all('p'):
        content = paragraph.get_text(' ', strip=True)
        if len(content) <= 50:
            continue  # too short to be body text
        selected.append(_clean_text(content))
        if len(selected) >= 3:
            break
    return '\n\n'.join(selected)
|
|
|
|
|
|
def extract_history(soup):
    """Return the text of the first history-like section of a Russian page.

    Each ``h2``/``h3`` heading is lower-cased, stripped of its
    ``[править ...]`` edit-link text and checked for any of the keywords
    below. For the first matching heading that is followed by qualifying
    paragraphs, those paragraphs are returned joined by blank lines. If no
    heading yields text, an empty string is returned.
    """
    keywords = (
        'история', 'этимология', 'открытие', 'происхождение названия',
        'название', 'нахождение', 'месторождения',
    )
    for heading in soup.find_all(['h2', 'h3']):
        title = heading.get_text(strip=True).lower()
        title = re.sub(r'\[править[^\]]*\]', '', title).strip()
        if not any(keyword in title for keyword in keywords):
            continue
        paragraphs = _collect_section_paragraphs(_find_heading_wrapper(heading))
        if paragraphs:
            return '\n\n'.join(paragraphs)
    return ''
|
|
|
|
|
|
def extract_infobox_color(soup):
    """Read the colour description from a Russian infobox, if there is one.

    Looks for the first ``table`` with class ``infobox`` and returns the
    text of the ``td`` from the first row whose ``th`` label contains
    ``цвет`` or ``окраска`` (case-insensitive). Returns an empty string when
    there is no infobox or no such row.
    """
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return ''

    for row in infobox.find_all('tr'):
        label_cell = row.find('th')
        value_cell = row.find('td')
        if not (label_cell and value_cell):
            continue  # need both a label and a value
        label = label_cell.get_text(strip=True).lower()
        if 'цвет' in label or 'окраска' in label:
            return value_cell.get_text(' ', strip=True)
    return ''
|
|
|
|
|
|
class Command(BaseCommand):
    """Fill the Russian fields of ``Mineral`` rows from Russian Wikipedia.

    For each mineral the command resolves the Russian article through the
    English article's language links, downloads and parses it, and stores
    ``name_ru``, ``description_ru``, ``history_ru``, ``wikipedia_url_ru``
    and, when an infobox colour is found, ``color_description_ru``.
    """

    help = 'Scrape Russian Wikipedia translations for existing minerals'

    def add_arguments(self, parser):
        """Register the ``--limit`` and ``--skip-existing`` options."""
        parser.add_argument('--limit', type=int, default=0)
        parser.add_argument('--skip-existing', action='store_true',
                            help='Skip minerals that already have Russian name')

    def handle(self, *args, **options):
        """Translate the selected minerals and print a per-mineral report.

        Every mineral gets one progress line; a summary with the number of
        translated, skipped and failed minerals is printed at the end. Any
        exception raised while processing a mineral is reported on its line
        and counted as a failure so the run continues with the next one.
        """
        limit = options['limit']
        skip_existing = options['skip_existing']

        minerals = Mineral.objects.all()
        if not minerals.ordered:
            # Slicing an unordered queryset is not guaranteed to return the
            # same rows on every run; pin the order so --limit is repeatable.
            # A default ordering declared on the model is left untouched.
            minerals = minerals.order_by('pk')
        # NOTE(review): --limit is applied here, before the --skip-existing
        # check in the loop, so skipped minerals still count towards the
        # limit. Confirm that this is the intended combination of the flags.
        if limit:
            minerals = minerals[:limit]

        total = minerals.count()
        self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        skipped = 0
        failed = 0

        for i, mineral in enumerate(minerals, 1):
            if skip_existing and mineral.name_ru:
                skipped += 1
                continue

            # No line ending: the outcome is appended to the same line below.
            self.stdout.write(f'[{i}/{total}] {mineral.name}... ', ending='')

            try:
                # Find Russian article
                ru_title = get_russian_title(self._english_title(mineral))
                if not ru_title:
                    self.stdout.write('no Russian article')
                    failed += 1
                    time.sleep(2)
                    continue

                # Fetch Russian page
                parse_data = get_russian_page(ru_title)
                if not parse_data:
                    self.stdout.write(f'failed to fetch {ru_title}')
                    failed += 1
                    time.sleep(2)
                    continue

                self._fill_russian_fields(mineral, ru_title, parse_data['text']['*'])
                mineral.save()
                success += 1
                self.stdout.write(f'{ru_title}')

            except Exception as e:
                # Per-mineral boundary: report the error and keep going.
                self.stdout.write(f'ERROR: {e}')
                failed += 1

            # Pause between minerals to stay polite towards the Wikipedia API.
            time.sleep(3)

        self.stdout.write(
            f'\nDone: {success} translated, {skipped} skipped, {failed} failed'
        )

    @staticmethod
    def _english_title(mineral):
        """Return the English article title: from the stored URL, else the name."""
        if mineral.wikipedia_url:
            raw_title = mineral.wikipedia_url.split('/wiki/')[-1]
            return requests.utils.unquote(raw_title)
        return mineral.name

    @staticmethod
    def _fill_russian_fields(mineral, ru_title, html):
        """Parse the article HTML and set the mineral's Russian fields (no save)."""
        soup = BeautifulSoup(html, 'html.parser')

        # Drop tables/divs with the 'navbox' or 'metadata' class so their
        # text does not end up in the extracted fields.
        for tag in soup.find_all(['table', 'div'], class_=['navbox', 'metadata']):
            tag.decompose()

        mineral.name_ru = ru_title
        mineral.description_ru = extract_description(soup)
        mineral.history_ru = extract_history(soup)
        mineral.wikipedia_url_ru = f'https://ru.wikipedia.org/wiki/{requests.utils.quote(ru_title)}'

        color = extract_infobox_color(soup)
        if color:
            # Stored value is truncated to 300 characters.
            mineral.color_description_ru = color[:300]
|