# k-boris-website/dailystone/management/commands/scrape_minerals_ru.py

"""
Scrape Russian Wikipedia translations for existing minerals.

Usage:
    python manage.py scrape_minerals_ru
    python manage.py scrape_minerals_ru --limit 10
    python manage.py scrape_minerals_ru --skip-existing
"""
import re
import time

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from dailystone.models import Mineral

# Shared HTTP session; stays None until get_session() is first called.
SESSION = None


def get_session():
    """Return the module-wide ``requests.Session``, creating it on first use.

    The session is stored in the ``SESSION`` global so every request made by
    this module reuses the same object and sends the same bot User-Agent.
    """
    global SESSION
    if SESSION is not None:
        return SESSION

    SESSION = requests.Session()
    user_agent = 'DailyStoneBot/1.0 (k-boris.tech; educational mineral wiki)'
    SESSION.headers.update({'User-Agent': user_agent})
    return SESSION


def _request_with_backoff(session, url, params, timeout=30, max_retries=5):
    """GET *url* and retry with a delay while the server answers HTTP 429.

    Args:
        session: Object exposing ``get(url, params=..., timeout=...)``
            (a ``requests.Session`` in this module).
        url: Endpoint to request.
        params: Query parameters passed through to ``session.get``.
        timeout: Per-request timeout in seconds.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so that a request is always made.

    Returns:
        The first response whose status is not 429 and that passes
        ``raise_for_status()``.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error status,
        including the final 429 once all attempts are used up.
    """
    # Always try at least once; otherwise ``resp`` below would be unbound.
    attempts = max(1, max_retries)
    max_wait = 120  # upper bound, in seconds, for any single sleep

    for attempt in range(attempts):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp

        if attempt == attempts - 1:
            # No retry follows, so sleeping would only delay the error.
            break

        retry_after = (resp.headers.get('Retry-After') or '').strip()
        if retry_after.isdigit():
            # Honour the server's hint, plus one second of slack.
            wait = min(int(retry_after) + 1, max_wait)
        else:
            # Header missing or not a plain number of seconds:
            # fall back to exponential backoff (10s, 20s, 40s, ...).
            wait = min(10 * (2 ** attempt), max_wait)
        time.sleep(wait)

    # Still rate-limited after the last attempt: surface the 429 to the caller.
    resp.raise_for_status()
    return resp


def _clean_text(text):
    """Strip editorial markers from *text* and tidy its whitespace.

    The substitutions run in order: numeric citation markers such as
    ``[1]`` or ``[2, 3]``, the ``[citation needed]`` and ``[уточнить]``
    tags (case-insensitive), collapsing whitespace runs to one space,
    removing whitespace before closing punctuation, and removing
    whitespace after an opening parenthesis. The result is stripped.
    """
    substitutions = (
        (r'\[[\d,\s]+\]', '', 0),
        (r'\[citation needed\]', '', re.IGNORECASE),
        (r'\[уточнить\]', '', re.IGNORECASE),
        (r'\s+', ' ', 0),
        (r'\s+([.,;:!?)])', r'\1', 0),
        (r'(\()\s+', r'\1', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()


def get_russian_title(english_title):
    """Look up the Russian Wikipedia title linked from an English article.

    Queries the English Wikipedia API for the ``ru`` language link of
    *english_title* (following redirects) and returns the linked title
    from the first page that has one, or ``None`` when no page does.
    """
    query = {
        'action': 'query',
        'titles': english_title,
        'prop': 'langlinks',
        'lllang': 'ru',
        'redirects': 1,
        'format': 'json',
    }
    response = _request_with_backoff(
        get_session(), 'https://en.wikipedia.org/w/api.php', params=query,
    )
    pages = response.json().get('query', {}).get('pages', {})

    # Take the first page that actually carries a language link.
    links_per_page = (page.get('langlinks', []) for page in pages.values())
    first_links = next((links for links in links_per_page if links), None)
    if first_links is None:
        return None
    return first_links[0]['*']


def get_russian_page(title):
    """Fetch the rendered HTML of a Russian Wikipedia article.

    Calls the ``parse`` action of the Russian Wikipedia API for *title*
    (following redirects) and returns the ``parse`` object of the JSON
    response, or ``None`` when the response contains an ``error`` key.
    """
    query = {
        'action': 'parse',
        'page': title,
        'prop': 'text',
        'format': 'json',
        'redirects': 1,
    }
    response = _request_with_backoff(
        get_session(), 'https://ru.wikipedia.org/w/api.php', params=query,
    )
    payload = response.json()
    return None if 'error' in payload else payload['parse']


def _find_heading_wrapper(tag):
    """Return the ``div.mw-heading`` parent of *tag*, or *tag* itself.

    When the heading's direct parent is a ``div`` whose class list contains
    ``mw-heading``, that parent is returned so that sibling traversal starts
    from the wrapper. Otherwise the heading is returned unchanged.
    """
    container = tag.parent
    if not container or container.name != 'div':
        return tag
    css_classes = container.get('class') or []
    return container if 'mw-heading' in css_classes else tag


def _collect_section_paragraphs(start_element, max_paras=2):
    """Gather cleaned paragraph texts that follow *start_element*.

    Walks the following siblings until the next heading, which is either a
    bare ``h2``/``h3`` or a ``div`` carrying one of the ``mw-heading*``
    classes. Each ``p`` whose text is longer than 30 characters is cleaned
    and kept; the walk also stops once *max_paras* paragraphs are collected.

    Returns:
        A list of cleaned paragraph strings (possibly empty).
    """
    stop_classes = {'mw-heading', 'mw-heading2', 'mw-heading3'}
    collected = []

    node = start_element.find_next_sibling()
    while node:
        if node.name in ('h2', 'h3'):
            break
        if node.name == 'div' and stop_classes.intersection(node.get('class') or []):
            break

        if node.name == 'p':
            raw = node.get_text(' ', strip=True)
            if len(raw) > 30:
                collected.append(_clean_text(raw))
                if len(collected) >= max_paras:
                    break

        node = node.find_next_sibling()

    return collected


def extract_description(soup):
    """Build a short description from the first substantial paragraphs.

    Takes the first three ``p`` elements of *soup* whose text is longer
    than 50 characters, cleans each one, and joins them with a blank line.
    Returns an empty string when no paragraph qualifies.
    """
    wanted = 3
    chunks = []
    for para in soup.find_all('p'):
        raw = para.get_text(' ', strip=True)
        if len(raw) <= 50:
            continue
        chunks.append(_clean_text(raw))
        if len(chunks) >= wanted:
            break
    return '\n\n'.join(chunks)


def extract_history(soup):
    """Return the text of the first history-like section that has content.

    Scans ``h2``/``h3`` headings in document order. A heading matches when
    its lower-cased text, with any ``[править ...]`` edit link removed,
    contains one of the known keywords. The paragraphs collected after the
    first matching heading that yields any are joined with a blank line.
    Returns an empty string when nothing is found.
    """
    keywords = (
        'история', 'этимология', 'открытие', 'происхождение названия',
        'название', 'нахождение', 'месторождения',
    )
    for heading in soup.find_all(['h2', 'h3']):
        title = heading.get_text(strip=True).lower()
        title = re.sub(r'\[править[^\]]*\]', '', title).strip()
        if not any(word in title for word in keywords):
            continue

        paragraphs = _collect_section_paragraphs(_find_heading_wrapper(heading))
        if paragraphs:
            return '\n\n'.join(paragraphs)
    return ''


def extract_infobox_color(soup):
    """Return the colour description from the article's infobox, if any.

    Looks at the first ``table.infobox`` in *soup* and returns the text of
    the first row that has both a ``th`` and a ``td`` and whose header
    contains ``цвет`` or ``окраска`` (case-insensitive). Returns an empty
    string when there is no infobox or no such row.
    """
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return ''

    for row in infobox.find_all('tr'):
        label_cell = row.find('th')
        value_cell = row.find('td')
        if not (label_cell and value_cell):
            continue
        label = label_cell.get_text(strip=True).lower()
        if 'цвет' in label or 'окраска' in label:
            return value_cell.get_text(' ', strip=True)
    return ''


class Command(BaseCommand):
    """Fill the Russian-language fields of ``Mineral`` rows from Wikipedia.

    For each mineral the command resolves the Russian article through the
    English article's language link, parses the Russian page, and stores
    the title, description, history, article URL and (when found) colour
    description on the model instance.
    """

    help = 'Scrape Russian Wikipedia translations for existing minerals'

    def add_arguments(self, parser):
        """Register the ``--limit`` and ``--skip-existing`` options."""
        parser.add_argument('--limit', type=int, default=0,
                            help='Maximum number of minerals to process '
                                 '(0 means no limit)')
        parser.add_argument('--skip-existing', action='store_true',
                            help='Skip minerals that already have Russian name')

    def handle(self, *args, **options):
        """Run the scrape and print one progress line per mineral.

        With ``--skip-existing`` the limit counts only minerals that are
        actually processed, so repeated limited runs keep making progress
        instead of re-examining the same already-translated rows.
        """
        # Imported here so the module-level import block stays untouched.
        from django.core.management.base import CommandError

        limit = options['limit']
        skip_existing = options['skip_existing']
        if limit < 0:
            raise CommandError('--limit must not be negative')

        minerals = Mineral.objects.all()
        if limit and not skip_existing:
            # Nothing gets skipped, so the cap can be applied by the query.
            minerals = minerals[:limit]

        total = minerals.count()
        if limit and skip_existing:
            self.stdout.write(
                f'Processing up to {limit} of {total} minerals...\n'
            )
        else:
            self.stdout.write(f'Processing {total} minerals...\n')

        success = 0
        skipped = 0
        failed = 0
        attempted = 0  # minerals processed, i.e. not skipped

        for i, mineral in enumerate(minerals, 1):
            if skip_existing and mineral.name_ru:
                skipped += 1
                continue

            # Enforce the cap on processed minerals rather than on rows read.
            if limit and attempted >= limit:
                break
            attempted += 1

            self.stdout.write(f'[{i}/{total}] {mineral.name}... ', ending='')

            try:
                # Extract English Wikipedia title from URL or use name
                if mineral.wikipedia_url:
                    en_title = mineral.wikipedia_url.split('/wiki/')[-1]
                    en_title = requests.utils.unquote(en_title)
                else:
                    en_title = mineral.name

                # Find Russian article
                ru_title = get_russian_title(en_title)
                if not ru_title:
                    self.stdout.write('no Russian article')
                    failed += 1
                    time.sleep(2)
                    continue

                # Fetch Russian page
                parse_data = get_russian_page(ru_title)
                if not parse_data:
                    self.stdout.write(f'failed to fetch {ru_title}')
                    failed += 1
                    time.sleep(2)
                    continue

                html = parse_data['text']['*']
                soup = BeautifulSoup(html, 'html.parser')

                # Drop navboxes and metadata boxes so their text is not
                # picked up by the extractors below.
                for tag in soup.find_all(['table', 'div'], class_=['navbox', 'metadata']):
                    tag.decompose()

                mineral.name_ru = ru_title
                mineral.description_ru = extract_description(soup)
                mineral.history_ru = extract_history(soup)
                mineral.wikipedia_url_ru = f'https://ru.wikipedia.org/wiki/{requests.utils.quote(ru_title)}'

                color = extract_infobox_color(soup)
                if color:
                    # Keep at most 300 characters of the colour text.
                    mineral.color_description_ru = color[:300]

                mineral.save()
                success += 1
                self.stdout.write(f'{ru_title}')

            except Exception as e:
                # Per-mineral boundary: report the error and move on so one
                # bad article does not abort the whole run.
                self.stdout.write(f'ERROR: {e}')
                failed += 1

            # Pause between minerals to stay polite to the Wikipedia API.
            time.sleep(3)

        self.stdout.write(
            f'\nDone: {success} translated, {skipped} skipped, {failed} failed'
        )