Files
renamer/renamer/utils/language_utils.py
SHA c5fbd367fc Add rename service and utility modules for file renaming operations
- Implemented RenameService for handling file renaming with features like name validation, proposed name generation, conflict detection, and atomic rename operations.
- Created utility modules for language code extraction, regex pattern matching, and frame class matching to centralize common functionalities.
- Added comprehensive logging for error handling and debugging across all new modules.
2025-12-31 03:13:26 +00:00

333 lines
11 KiB
Python

"""Language code extraction and conversion utilities.
This module provides centralized logic for extracting and converting language codes
from filenames and metadata. This eliminates the ~150+ lines of duplicated code
between FilenameExtractor and MediaInfoExtractor.
"""
# Standard library
import logging
import re
from typing import Optional

# Third-party: used by LanguageCodeExtractor to parse and convert language tags
import langcodes

# Module-level logger named after this module's import path
logger = logging.getLogger(__name__)
class LanguageCodeExtractor:
    """Shared language code extraction logic.

    This class centralizes all language code detection and conversion logic,
    eliminating duplication across multiple extractors. It holds no
    per-instance state; all configuration lives in class-level constants.

    Example:
        >>> extractor = LanguageCodeExtractor()
        >>> langs = extractor.extract_from_brackets("[2xUKR_ENG]")
        >>> print(langs)
        ['ukr', 'ukr', 'eng']
    """

    # Three-letter language codes accepted as standalone filename tokens
    # (matched against the lowercased token). The group headings below are a
    # rough guide only; membership in the set is all that matters.
    KNOWN_CODES = {
        # Most common codes
        'eng', 'ukr', 'rus', 'fra', 'deu', 'spa', 'ita', 'por', 'nor', 'swe',
        'dan', 'fin', 'pol', 'cze', 'hun', 'tur', 'ara', 'heb', 'hin', 'jpn',
        'kor', 'chi', 'tha', 'vie', 'und',
        # European languages
        'dut', 'nld', 'bel', 'bul', 'hrv', 'ces', 'est', 'ell', 'ind',
        'lav', 'lit', 'mkd', 'ron', 'slk', 'slv', 'srp', 'zho',
        # South Asian languages
        'arb', 'ben', 'mar', 'tam', 'tel', 'urd', 'guj', 'kan', 'mal', 'ori',
        'pan', 'asm', 'mai', 'bho', 'nep', 'sin', 'san', 'tib', 'mon',
        # Central Asian languages
        'kaz', 'uzb', 'kir', 'tuk', 'aze', 'kat', 'hye', 'geo',
        # Balkan languages
        'sqi', 'bos', 'alb', 'mol',
        # Nordic languages
        'isl', 'fao',
        # Other Asian languages
        'per', 'kur', 'pus', 'div', 'lao', 'khm', 'mya', 'msa',
        'yue', 'wuu', 'nan', 'hak', 'gan', 'hsn',
        # Various other codes
        'awa', 'mag',
    }

    # Codes that are still accepted when written in title case ("Ukr").
    # Any other title-case token is treated as an ordinary word to avoid
    # false positives.
    ALLOWED_TITLE_CASE = {
        'ukr', 'nor', 'eng', 'rus', 'fra', 'deu', 'spa', 'ita', 'por', 'swe',
        'dan', 'fin', 'pol', 'cze', 'hun', 'tur', 'ara', 'heb', 'hin', 'jpn',
        'kor', 'chi', 'tha', 'vie', 'und',
    }

    # Tokens that are never language codes (common English words, file
    # extensions, quality indicators, ...). Compared against lowercased text.
    SKIP_WORDS = {
        # Common English words
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
        'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'way', 'who',
        'boy', 'did', 'let', 'put', 'say', 'she', 'too', 'use',
        # File extensions
        'avi', 'mkv', 'mp4', 'mpg', 'mov', 'wmv', 'flv', 'webm', 'm4v',
        'm2ts', 'ts', 'vob', 'iso', 'img',
        # Quality/resolution indicators
        'sd', 'hd', 'lq', 'qhd', 'uhd', 'p', 'i', 'hdr', 'sdr', '4k', '8k',
        '2160p', '1080p', '720p', '480p', '360p', '240p', '144p',
        # Source/encoding indicators
        'web', 'dl', 'rip', 'bluray', 'dvd', 'hdtv', 'bdrip', 'dvdrip',
        'xvid', 'divx', 'h264', 'h265', 'x264', 'x265', 'hevc', 'avc',
        # Audio codecs
        'ma', 'atmos', 'dts', 'aac', 'ac3', 'mp3', 'flac', 'wav', 'wma',
        'ogg', 'opus',
        # Subtitle indicator
        'sub', 'subs', 'subtitle',
    }

    # Largest repeat prefix ("<N>x<code>") honoured inside brackets. The count
    # comes straight from the input text and is used to size a list, so an
    # item such as "1000000000xENG" must not be expanded.
    MAX_TRACK_REPEAT = 99

    # Brackets containing any of these substrings hold database ids, not
    # language lists, and are ignored entirely.
    _DATABASE_MARKERS = ('imdb', 'tmdb', 'tvdb')

    # Patterns are compiled once here instead of on every call.
    _BRACKET_RE = re.compile(r'\[([^\]]+)\]')
    _BRACKET_ITEM_SPLIT_RE = re.compile(r'[,_]')
    # Whole item: optional "<digits>" with optional "x", then a 2-3 letter code.
    _COUNTED_CODE_RE = re.compile(r'(?:(\d+)x?)?([a-z]{2,3})')
    _TOKEN_SPLIT_RE = re.compile(r'[.\s_]+')
    _CODE_TOKEN_RE = re.compile(r'[a-zA-Z]{2,3}')

    def __init__(self):
        """Initialize the language code extractor (no instance state)."""

    def extract_from_brackets(self, text: str) -> list[str]:
        """Extract language codes from bracketed content.

        Handles patterns like:
            - [UKR_ENG] → ['ukr', 'eng']
            - [2xUKR_ENG] → ['ukr', 'ukr', 'eng']
            - [4xUKR,ENG] → ['ukr', 'ukr', 'ukr', 'ukr', 'eng']

        Brackets mentioning imdb/tmdb/tvdb are skipped. Inside a bracket,
        items are separated by commas or underscores; an item is used only
        if, as a whole, it is an optional repeat count (digits with an
        optional ``x``) followed by a 2-3 letter code that is not in
        ``SKIP_WORDS``. Items whose repeat count exceeds
        ``MAX_TRACK_REPEAT`` are ignored.

        Args:
            text: Text containing bracketed language codes

        Returns:
            List of 3-letter codes as produced by ``_convert_to_iso3``, with
            repeated entries for counted items

        Example:
            >>> extractor = LanguageCodeExtractor()
            >>> extractor.extract_from_brackets("[2xUKR_ENG]")
            ['ukr', 'ukr', 'eng']
        """
        langs: list[str] = []

        for bracket in self._BRACKET_RE.findall(text):
            bracket_lower = bracket.lower()

            # Skip brackets containing movie database patterns
            if any(marker in bracket_lower for marker in self._DATABASE_MARKERS):
                continue

            for raw_item in self._BRACKET_ITEM_SPLIT_RE.split(bracket):
                item = raw_item.strip()
                # Skip empty or single-character items
                if len(item) < 2:
                    continue

                item_lower = item.lower()
                if item_lower in self.SKIP_WORDS:
                    continue

                # fullmatch validates the whole item in one step, so no
                # separate prefix check is needed and a stray newline inside
                # the item can no longer slip past a "$" anchor.
                match = self._COUNTED_CODE_RE.fullmatch(item_lower)
                if match is None:
                    continue

                repeat_text, lang_code = match.groups()
                # e.g. "2xhd": the item is not a skip word but its code part is
                if lang_code in self.SKIP_WORDS:
                    continue

                count = int(repeat_text) if repeat_text else 1
                # Checked before conversion/allocation: an absurd count is
                # treated as "not a language tag" rather than expanded.
                if count > self.MAX_TRACK_REPEAT:
                    continue

                iso3_code = self._convert_to_iso3(lang_code)
                if iso3_code:
                    langs.extend([iso3_code] * count)

        return langs

    def extract_standalone(self, text: str) -> list[str]:
        """Extract standalone language codes from text.

        Bracketed content is removed first; the rest is split on dots,
        whitespace and underscores. A token is accepted when it is 2-3 ASCII
        letters, is not in ``SKIP_WORDS``, and its lowercase form is in
        ``KNOWN_CODES``. Accepted spellings:
            - Uppercase: ENG, UKR, NOR
            - Title case: Ukr, Nor, Eng (only codes in ``ALLOWED_TITLE_CASE``;
              two-letter title-case tokens are always rejected)
            - Lowercase: ukr, nor, eng
            - Dot-separated: .ukr. .eng.

        ``KNOWN_CODES`` as defined on this class contains only three-letter
        codes, so two-letter tokens are not returned unless it is extended.

        Args:
            text: Text to extract language codes from

        Returns:
            List of 3-letter codes as produced by ``_convert_to_iso3``, in
            order of appearance (duplicates kept)

        Example:
            >>> extractor = LanguageCodeExtractor()
            >>> extractor.extract_standalone("Movie.2024.UKR.ENG.1080p.mkv")
            ['ukr', 'eng']
        """
        langs: list[str] = []

        # Bracketed content is handled by extract_from_brackets
        text_without_brackets = self._BRACKET_RE.sub('', text)

        for raw_token in self._TOKEN_SPLIT_RE.split(text_without_brackets):
            token = raw_token.strip()
            if not self._CODE_TOKEN_RE.fullmatch(token):
                continue

            token_lower = token.lower()

            # Title-case tokens look like ordinary words ("The", "Mon"), so
            # only an explicit allow-list of three-letter codes is accepted.
            if token.istitle() and (
                len(token) == 2 or token_lower not in self.ALLOWED_TITLE_CASE
            ):
                continue

            if token_lower in self.SKIP_WORDS or token_lower not in self.KNOWN_CODES:
                continue

            iso3_code = self._convert_to_iso3(token_lower)
            if iso3_code:
                langs.append(iso3_code)

        return langs

    def extract_all(self, text: str) -> list[str]:
        """Extract all language codes from text (both bracketed and standalone).

        Args:
            text: Text to extract language codes from

        Returns:
            List of 3-letter language codes, bracketed results first, with
            duplicates removed while preserving order of first appearance
            (repeat counts from brackets are therefore not retained)

        Example:
            >>> extractor = LanguageCodeExtractor()
            >>> extractor.extract_all("Movie [UKR_ENG] 2024.rus.mkv")
            ['ukr', 'eng', 'rus']
        """
        bracketed = self.extract_from_brackets(text)
        standalone = self.extract_standalone(text)
        # dict keys are unique and keep insertion order
        return list(dict.fromkeys(bracketed + standalone))

    def format_lang_counts(self, langs: list[str]) -> str:
        """Format language list with counts like MediaInfo.

        Formats like: "2ukr,eng" for 2 Ukrainian tracks and 1 English track.
        Languages appear in order of first occurrence; a count is written
        only when it is greater than one.

        Args:
            langs: List of language codes (can have duplicates)

        Returns:
            Formatted string with counts, or '' for an empty/None input

        Example:
            >>> extractor = LanguageCodeExtractor()
            >>> extractor.format_lang_counts(['ukr', 'ukr', 'eng'])
            '2ukr,eng'
        """
        if not langs:
            return ''

        # Insertion-ordered dict gives both the counts and the output order
        counts: dict[str, int] = {}
        for lang in langs:
            counts[lang] = counts.get(lang, 0) + 1

        return ','.join(
            f"{count}{lang}" if count > 1 else lang
            for lang, count in counts.items()
        )

    def _convert_to_iso3(self, lang_code: str) -> Optional[str]:
        """Convert a language code to a 3-letter code via ``langcodes``.

        Args:
            lang_code: 2 or 3 letter language code

        Returns:
            The result of ``langcodes.Language.get(lang_code).to_alpha3()``,
            or None if langcodes raises LookupError, ValueError or
            AttributeError for the input (the failure is logged at DEBUG)

        Example:
            >>> extractor = LanguageCodeExtractor()
            >>> extractor._convert_to_iso3('en')
            'eng'
            >>> extractor._convert_to_iso3('ukr')
            'ukr'
        """
        try:
            return langcodes.Language.get(lang_code).to_alpha3()
        except (LookupError, ValueError, AttributeError) as e:
            # Lazy %-args: the message is only built if DEBUG is enabled
            logger.debug("Invalid language code '%s': %s", lang_code, e)
            return None

    def is_valid_code(self, code: str) -> bool:
        """Check if a code is a valid language code.

        A code is valid when it can be converted by ``_convert_to_iso3`` and
        langcodes reports the parsed tag as valid. The second check is
        needed because a successful conversion alone does not appear to
        prove the code is a registered language: langcodes seems to return
        three-letter subtags from ``to_alpha3()`` unchanged.

        Args:
            code: The code to check

        Returns:
            True if valid language code

        Example:
            >>> extractor = LanguageCodeExtractor()
            >>> extractor.is_valid_code('eng')
            True
            >>> extractor.is_valid_code('xyz')
            False
        """
        if self._convert_to_iso3(code) is None:
            return False
        # Language.get() already succeeded inside _convert_to_iso3
        return langcodes.Language.get(code).is_valid()