From a9b9771b92371f0cfe0d13dd0aec39c1f159ed42 Mon Sep 17 00:00:00 2001 From: Walter Behmann Date: Wed, 20 Aug 2025 00:32:49 +0200 Subject: [PATCH 01/18] Admin/XMover: Add XMover - CrateDB Shard Analyzer and Movement Tool --- CHANGES.md | 2 + cratedb_toolkit/admin/__init__.py | 0 cratedb_toolkit/admin/xmover/__init__.py | 10 + cratedb_toolkit/admin/xmover/analyzer.py | 1005 +++++++++++++++ cratedb_toolkit/admin/xmover/attic.py | 118 ++ cratedb_toolkit/admin/xmover/cli.py | 1431 ++++++++++++++++++++++ cratedb_toolkit/admin/xmover/database.py | 584 +++++++++ cratedb_toolkit/cli.py | 2 + doc/admin/index.md | 7 + doc/admin/xmover/handbook.md | 487 ++++++++ doc/admin/xmover/index.md | 29 + doc/admin/xmover/queries.md | 212 ++++ doc/admin/xmover/troubleshooting.md | 424 +++++++ doc/index.md | 1 + pyproject.toml | 2 + 15 files changed, 4314 insertions(+) create mode 100644 cratedb_toolkit/admin/__init__.py create mode 100644 cratedb_toolkit/admin/xmover/__init__.py create mode 100644 cratedb_toolkit/admin/xmover/analyzer.py create mode 100644 cratedb_toolkit/admin/xmover/attic.py create mode 100644 cratedb_toolkit/admin/xmover/cli.py create mode 100644 cratedb_toolkit/admin/xmover/database.py create mode 100644 doc/admin/index.md create mode 100644 doc/admin/xmover/handbook.md create mode 100644 doc/admin/xmover/index.md create mode 100644 doc/admin/xmover/queries.md create mode 100644 doc/admin/xmover/troubleshooting.md diff --git a/CHANGES.md b/CHANGES.md index d47196a2..7ef70478 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,8 @@ # Changelog ## Unreleased +- Admin: Added XMover - CrateDB Shard Analyzer and Movement Tool. + Thanks, @WalBeh. ## 2025/08/19 v0.0.41 - I/O: Updated to `influxio-0.6.0`. Thanks, @ZillKhan. 
diff --git a/cratedb_toolkit/admin/__init__.py b/cratedb_toolkit/admin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cratedb_toolkit/admin/xmover/__init__.py b/cratedb_toolkit/admin/xmover/__init__.py new file mode 100644 index 00000000..b941f602 --- /dev/null +++ b/cratedb_toolkit/admin/xmover/__init__.py @@ -0,0 +1,10 @@ +""" +XMover - CrateDB Shard Analyzer and Movement Tool + +A tool for analyzing CrateDB shard distribution across nodes and availability zones, +and generating safe SQL commands for shard rebalancing. +""" + +__version__ = "0.1.0" +__author__ = "CrateDB Tools" +__description__ = "CrateDB shard analyzer and movement tool" \ No newline at end of file diff --git a/cratedb_toolkit/admin/xmover/analyzer.py b/cratedb_toolkit/admin/xmover/analyzer.py new file mode 100644 index 00000000..75af9090 --- /dev/null +++ b/cratedb_toolkit/admin/xmover/analyzer.py @@ -0,0 +1,1005 @@ +""" +Shard analysis and rebalancing logic for CrateDB +""" + +from typing import Dict, List, Optional, Set, Any, Tuple +from dataclasses import dataclass +from collections import defaultdict +import math + +from .database import CrateDBClient, NodeInfo, ShardInfo, RecoveryInfo + + +@dataclass +class MoveRecommendation: + """Recommendation for moving a shard""" + table_name: str + schema_name: str + shard_id: int + from_node: str + to_node: str + from_zone: str + to_zone: str + shard_type: str + size_gb: float + reason: str + + def to_sql(self) -> str: + """Generate the SQL command for this move""" + return (f'ALTER TABLE "{self.schema_name}"."{self.table_name}" ' + f"REROUTE MOVE SHARD {self.shard_id} " + f"FROM '{self.from_node}' TO '{self.to_node}';") + + @property + def safety_score(self) -> float: + """Calculate a safety score for this move (0-1, higher is safer)""" + score = 1.0 + + # Penalize if moving to same zone (not ideal for zone distribution) + if self.from_zone == self.to_zone: + score -= 0.3 + + # Bonus for zone balancing moves + if 
"rebalancing" in self.reason.lower(): + score += 0.2 + + # Ensure score stays in valid range + return max(0.0, min(1.0, score)) + + +@dataclass +class DistributionStats: + """Statistics about shard distribution""" + total_shards: int + total_size_gb: float + zones: Dict[str, int] + nodes: Dict[str, int] + zone_balance_score: float # 0-100, higher is better + node_balance_score: float # 0-100, higher is better + + +class ShardAnalyzer: + """Analyzer for CrateDB shard distribution and rebalancing""" + + def __init__(self, client: CrateDBClient): + self.client = client + self.nodes: List[NodeInfo] = [] + self.shards: List[ShardInfo] = [] + + # Initialize session-based caches for performance + self._zone_conflict_cache = {} + self._node_lookup_cache = {} + self._target_nodes_cache = {} + self._cache_hits = 0 + self._cache_misses = 0 + + self._refresh_data() + + def _refresh_data(self): + """Refresh node and shard data from the database""" + self.nodes = self.client.get_nodes_info() + # For analysis, get all shards regardless of state + self.shards = self.client.get_shards_info(for_analysis=True) + + def analyze_distribution(self, table_name: Optional[str] = None) -> DistributionStats: + """Analyze the current shard distribution""" + # Filter shards by table if specified + shards = self.shards + if table_name: + shards = [s for s in shards if s.table_name == table_name] + + if not shards: + return DistributionStats(0, 0.0, {}, {}, 100.0, 100.0) + + total_shards = len(shards) + total_size_gb = sum(s.size_gb for s in shards) + + # Count by zone and node + zone_counts = defaultdict(int) + node_counts = defaultdict(int) + + for shard in shards: + zone_counts[shard.zone] += 1 + node_counts[shard.node_name] += 1 + + # Calculate balance scores + zone_balance_score = self._calculate_balance_score(list(zone_counts.values())) + node_balance_score = self._calculate_balance_score(list(node_counts.values())) + + return DistributionStats( + total_shards=total_shards, + 
total_size_gb=total_size_gb, + zones=dict(zone_counts), + nodes=dict(node_counts), + zone_balance_score=zone_balance_score, + node_balance_score=node_balance_score + ) + + def _calculate_balance_score(self, counts: List[int]) -> float: + """Calculate a balance score (0-100) for a distribution""" + if not counts or len(counts) <= 1: + return 100.0 + + mean_count = sum(counts) / len(counts) + if mean_count == 0: + return 100.0 + + # Calculate coefficient of variation + variance = sum((count - mean_count) ** 2 for count in counts) / len(counts) + std_dev = math.sqrt(variance) + cv = std_dev / mean_count + + # Convert to score (lower CV = higher score) + # CV of 0 = 100%, CV of 1 = ~37%, CV of 2 = ~14% + score = max(0, 100 * math.exp(-cv)) + return round(score, 1) + + def find_moveable_shards(self, + min_size_gb: float = 40.0, + max_size_gb: float = 60.0, + table_name: Optional[str] = None) -> List[ShardInfo]: + """Find shards that are candidates for moving based on size + + Only returns healthy shards that are safe to move. + Prioritizes shards from nodes with less available space. 
+ """ + # Get only healthy shards (STARTED + 100% recovered) for safe operations + healthy_shards = self.client.get_shards_info( + table_name=table_name, + min_size_gb=min_size_gb, + max_size_gb=max_size_gb, + for_analysis=False # Only operational shards + ) + + + # Create a mapping of node names to available space + node_space_map = {node.name: node.available_space_gb for node in self.nodes} + + # Sort by node available space (ascending, so low space nodes first), then by shard size + healthy_shards.sort(key=lambda s: (node_space_map.get(s.node_name, float('inf')), s.size_gb)) + return healthy_shards + + def check_zone_balance(self, + table_name: Optional[str] = None, + tolerance_percent: float = 10.0) -> Dict[str, Dict[str, int]]: + """Check if zones are balanced within tolerance""" + # Filter shards by table if specified + shards = self.shards + if table_name: + shards = [s for s in shards if s.table_name == table_name] + + # Count shards by zone and type + zone_stats = defaultdict(lambda: {'PRIMARY': 0, 'REPLICA': 0, 'TOTAL': 0}) + + for shard in shards: + shard_type = shard.shard_type + zone_stats[shard.zone][shard_type] += 1 + zone_stats[shard.zone]['TOTAL'] += 1 + + return dict(zone_stats) + + def find_nodes_with_capacity(self, + required_space_gb: float, + exclude_zones: Optional[Set[str]] = None, + exclude_nodes: Optional[Set[str]] = None, + min_free_space_gb: float = 100.0, + max_disk_usage_percent: float = 85.0) -> List[NodeInfo]: + """Find nodes that have capacity for additional shards + + Args: + required_space_gb: Minimum space needed for the shard + exclude_zones: Zones to exclude from consideration + exclude_nodes: Specific nodes to exclude + min_free_space_gb: Additional buffer space required + max_disk_usage_percent: Maximum disk usage percentage allowed + """ + available_nodes = [] + + for node in self.nodes: + # Skip zones we want to exclude + if exclude_zones and node.zone in exclude_zones: + continue + + # Skip specific nodes we want to 
exclude + if exclude_nodes and node.name in exclude_nodes: + continue + + # Check disk usage threshold + if node.disk_usage_percent > max_disk_usage_percent: + continue + + # Check if node has enough free space + free_space_gb = node.available_space_gb + if free_space_gb >= (required_space_gb + min_free_space_gb): + available_nodes.append(node) + else: + continue + + # Sort by available space (most space first) - prioritize nodes with more free space + available_nodes.sort(key=lambda n: n.available_space_gb, reverse=True) + return available_nodes + + def generate_rebalancing_recommendations(self, + table_name: Optional[str] = None, + min_size_gb: float = 40.0, + max_size_gb: float = 60.0, + zone_tolerance_percent: float = 10.0, + min_free_space_gb: float = 100.0, + max_recommendations: int = 10, + prioritize_space: bool = False, + source_node: Optional[str] = None, + max_disk_usage_percent: float = 90.0) -> List[MoveRecommendation]: + """Generate recommendations for rebalancing shards + + Args: + prioritize_space: If True, prioritizes moving shards from nodes with less available space + regardless of zone balance. If False, prioritizes zone balancing first. 
+ source_node: If specified, only generate recommendations for shards on this node + max_disk_usage_percent: Maximum disk usage percentage for target nodes + """ + recommendations = [] + + # Get moveable shards (only healthy ones for actual operations) + moveable_shards = self.find_moveable_shards(min_size_gb, max_size_gb, table_name) + + print(f"Analyzing {len(moveable_shards)} candidate shards in size range {min_size_gb}-{max_size_gb}GB...") + + if not moveable_shards: + return recommendations + + # Analyze current zone balance + zone_stats = self.check_zone_balance(table_name, zone_tolerance_percent) + + # Calculate target distribution + total_shards = sum(stats['TOTAL'] for stats in zone_stats.values()) + zones = list(zone_stats.keys()) + target_per_zone = total_shards // len(zones) if zones else 0 + + # Find zones that are over/under capacity + overloaded_zones = [] + underloaded_zones = [] + + for zone, stats in zone_stats.items(): + current_count = stats['TOTAL'] + threshold_high = target_per_zone * (1 + zone_tolerance_percent / 100) + threshold_low = target_per_zone * (1 - zone_tolerance_percent / 100) + + if current_count > threshold_high: + overloaded_zones.append(zone) + elif current_count < threshold_low: + underloaded_zones.append(zone) + + # Optimize processing: if filtering by source node, only process those shards + if source_node: + processing_shards = [s for s in moveable_shards if s.node_name == source_node] + print(f"Focusing on {len(processing_shards)} shards from node {source_node}") + else: + processing_shards = moveable_shards + + # Generate move recommendations + safe_recommendations = 0 + total_evaluated = 0 + + for i, shard in enumerate(processing_shards): + if len(recommendations) >= max_recommendations: + break + + # Show progress every 50 shards when processing many + if len(processing_shards) > 100 and i > 0 and i % 50 == 0: + print(".", end="", flush=True) + + total_evaluated += 1 + + # Skip based on priority mode + if not 
prioritize_space: + # Zone balancing mode: only move shards from overloaded zones + if shard.zone not in overloaded_zones: + continue + # In space priority mode, consider all shards regardless of zone balance + + # Find target nodes, excluding the source node and prioritizing by available space (with caching) + target_nodes = self._find_nodes_with_capacity_cached( + required_space_gb=shard.size_gb, + exclude_nodes={shard.node_name}, # Don't move to same node + min_free_space_gb=min_free_space_gb, + max_disk_usage_percent=max_disk_usage_percent + ) + + # Quick pre-filter to avoid expensive safety validations + # Only check nodes in different zones (for zone balancing) + if not prioritize_space: + target_nodes = [node for node in target_nodes if node.zone != shard.zone] + + # Limit to top 3 candidates to reduce validation overhead + target_nodes = target_nodes[:3] + + # Filter target nodes to find safe candidates + safe_target_nodes = [] + for candidate_node in target_nodes: + # Create a temporary recommendation to test safety + temp_rec = MoveRecommendation( + table_name=shard.table_name, + schema_name=shard.schema_name, + shard_id=shard.shard_id, + from_node=shard.node_name, + to_node=candidate_node.name, + from_zone=shard.zone, + to_zone=candidate_node.zone, + shard_type=shard.shard_type, + size_gb=shard.size_gb, + reason="Safety validation" + ) + + # Check if this move would be safe + is_safe, safety_msg = self.validate_move_safety(temp_rec, max_disk_usage_percent) + if is_safe: + safe_target_nodes.append(candidate_node) + + if not safe_target_nodes: + continue # No safe targets found, skip this shard + + if prioritize_space: + # Space priority mode: choose node with most available space + target_node = safe_target_nodes[0] # Already sorted by available space (desc) + else: + # Zone balance mode: prefer underloaded zones, then available space + target_zones = set(underloaded_zones) - {shard.zone} + preferred_nodes = [n for n in safe_target_nodes if n.zone in 
target_zones] + other_nodes = [n for n in safe_target_nodes if n.zone not in target_zones] + + # Choose target node with intelligent priority: + # 1. If a node has significantly more space (2x) than zone-preferred nodes, prioritize space + # 2. Otherwise, prefer zone balancing first, then available space + target_node = None + + if preferred_nodes and other_nodes: + best_preferred = preferred_nodes[0] # Most space in preferred zones + best_other = other_nodes[0] # Most space in other zones + + # If the best "other" node has significantly more space (2x), choose it + if best_other.available_space_gb >= (best_preferred.available_space_gb * 2): + target_node = best_other + else: + target_node = best_preferred + elif preferred_nodes: + target_node = preferred_nodes[0] + elif other_nodes: + target_node = other_nodes[0] + else: + continue # No suitable target found + + # Determine the reason for the move + if prioritize_space: + if shard.zone == target_node.zone: + reason = f"Space optimization within {shard.zone}" + else: + reason = f"Space optimization: {shard.zone} -> {target_node.zone}" + else: + reason = f"Zone rebalancing: {shard.zone} -> {target_node.zone}" + if shard.zone == target_node.zone: + reason = f"Node balancing within {shard.zone}" + + recommendation = MoveRecommendation( + table_name=shard.table_name, + schema_name=shard.schema_name, + shard_id=shard.shard_id, + from_node=shard.node_name, + to_node=target_node.name, + from_zone=shard.zone, + to_zone=target_node.zone, + shard_type=shard.shard_type, + size_gb=shard.size_gb, + reason=reason + ) + + recommendations.append(recommendation) + + if len(processing_shards) > 100: + print() # New line after progress dots + print(f"Generated {len(recommendations)} move recommendations (evaluated {total_evaluated} shards)") + print(f"Performance: {self.get_cache_stats()}") + return recommendations + + def validate_move_safety(self, recommendation: MoveRecommendation, + max_disk_usage_percent: float = 90.0) -> 
Tuple[bool, str]: + """Validate that a move recommendation is safe to execute""" + # Find target node (with caching) + target_node = self._get_node_cached(recommendation.to_node) + + if not target_node: + return False, f"Target node '{recommendation.to_node}' not found" + + # Check for zone conflicts (same shard already exists in target zone) - with caching + zone_conflict = self._check_zone_conflict_cached(recommendation) + if zone_conflict: + return False, zone_conflict + + # Check available space + required_space_gb = recommendation.size_gb + 50 # 50GB buffer + if target_node.available_space_gb < required_space_gb: + return False, f"Insufficient space on target node (need {required_space_gb:.1f}GB, have {target_node.available_space_gb:.1f}GB)" + + # Check disk usage + if target_node.disk_usage_percent > max_disk_usage_percent: + return False, f"Target node disk usage too high ({target_node.disk_usage_percent:.1f}%)" + + return True, "Move appears safe" + + def _get_node_cached(self, node_name: str): + """Get node by name with caching""" + if node_name in self._node_lookup_cache: + self._cache_hits += 1 + return self._node_lookup_cache[node_name] + + # Find node (cache miss) + self._cache_misses += 1 + target_node = None + for node in self.nodes: + if node.name == node_name: + target_node = node + break + + self._node_lookup_cache[node_name] = target_node + return target_node + + def _check_zone_conflict_cached(self, recommendation: MoveRecommendation) -> Optional[str]: + """Check zone conflicts with caching""" + # Create cache key: table, shard, target zone + target_zone = self._get_node_zone(recommendation.to_node) + cache_key = (recommendation.table_name, recommendation.shard_id, target_zone) + + if cache_key in self._zone_conflict_cache: + self._cache_hits += 1 + return self._zone_conflict_cache[cache_key] + + # Cache miss - do expensive check + self._cache_misses += 1 + result = self._check_zone_conflict(recommendation) + self._zone_conflict_cache[cache_key] 
= result + return result + + def _get_node_zone(self, node_name: str) -> str: + """Get zone for a node name""" + node = self._get_node_cached(node_name) + return node.zone if node else "unknown" + + def get_cache_stats(self) -> str: + """Get cache performance statistics""" + total = self._cache_hits + self._cache_misses + if total == 0: + return "Cache stats: No operations yet" + + hit_rate = (self._cache_hits / total) * 100 + return f"Cache stats: {hit_rate:.1f}% hit rate ({self._cache_hits} hits, {self._cache_misses} misses)" + + def _find_nodes_with_capacity_cached(self, required_space_gb: float, exclude_nodes: set, + min_free_space_gb: float, max_disk_usage_percent: float) -> List[NodeInfo]: + """Find nodes with capacity using caching for repeated queries""" + # Create cache key based on parameters (rounded to avoid float precision issues) + cache_key = ( + round(required_space_gb, 1), + frozenset(exclude_nodes), + round(min_free_space_gb, 1), + round(max_disk_usage_percent, 1) + ) + + if cache_key in self._target_nodes_cache: + self._cache_hits += 1 + return self._target_nodes_cache[cache_key] + + # Cache miss - do expensive calculation + self._cache_misses += 1 + result = self.find_nodes_with_capacity( + required_space_gb=required_space_gb, + exclude_nodes=exclude_nodes, + min_free_space_gb=min_free_space_gb, + max_disk_usage_percent=max_disk_usage_percent + ) + + self._target_nodes_cache[cache_key] = result + return result + + def _check_zone_conflict(self, recommendation: MoveRecommendation) -> Optional[str]: + """Check if moving this shard would create a zone conflict + + Performs comprehensive zone safety analysis: + - Checks if target node already has a copy of this shard + - Checks if target zone already has copies + - Analyzes zone allocation limits and CrateDB's zone awareness rules + - Ensures move doesn't violate zone-awareness principles + """ + try: + # Query to get all copies of this shard across nodes and zones + query = """ + SELECT + 
s.node['id'] as node_id, + s.node['name'] as node_name, + n.attributes['zone'] as zone, + s."primary" as is_primary, + s.routing_state, + s.state + FROM sys.shards s + JOIN sys.nodes n ON s.node['id'] = n.id + WHERE s.table_name = ? + AND s.schema_name = ? + AND s.id = ? + ORDER BY s."primary" DESC, zone, node_name + """ + + result = self.client.execute_query(query, [ + recommendation.table_name, + recommendation.schema_name, + recommendation.shard_id + ]) + + if not result.get('rows'): + return f"Cannot find shard {recommendation.shard_id} for table {recommendation.schema_name}.{recommendation.table_name}" + + # Analyze current distribution + zones_with_copies = set() + nodes_with_copies = set() + current_location = None + healthy_copies = 0 + total_copies = 0 + target_node_id = None + + # Get target node ID for the recommendation + for node in self.nodes: + if node.name == recommendation.to_node: + target_node_id = node.id + break + + if not target_node_id: + return f"Target node {recommendation.to_node} not found in cluster" + + for row in result['rows']: + node_id, node_name, zone, is_primary, routing_state, state = row + zone = zone or 'unknown' + total_copies += 1 + + # Track the shard we're planning to move + if node_name == recommendation.from_node: + current_location = { + 'zone': zone, + 'is_primary': is_primary, + 'routing_state': routing_state, + 'state': state + } + + # Track all copies for conflict detection + nodes_with_copies.add(node_id) + if routing_state == 'STARTED' and state == 'STARTED': + healthy_copies += 1 + zones_with_copies.add(zone) + + # Validate the shard we're trying to move exists and is healthy + if not current_location: + return f"Shard not found on source node {recommendation.from_node}" + + if current_location['routing_state'] != 'STARTED': + return f"Source shard is not in STARTED state (current: {current_location['routing_state']})" + + # CRITICAL CHECK 1: Target node already has a copy of this shard + if target_node_id in 
nodes_with_copies: + return f"Node conflict: Target node {recommendation.to_node} already has a copy of shard {recommendation.shard_id}" + + # CRITICAL CHECK 2: Target zone already has a copy (zone allocation limits) + if recommendation.to_zone in zones_with_copies: + return f"Zone conflict: {recommendation.to_zone} already has a copy of shard {recommendation.shard_id}" + + # CRITICAL CHECK 3: Ensure we're not creating a single point of failure + if len(zones_with_copies) == 1 and current_location['zone'] in zones_with_copies: + # This is the only zone with this shard - moving it is good for zone distribution + pass + elif len(zones_with_copies) <= 1 and healthy_copies <= 1: + return f"Safety concern: Only {healthy_copies} healthy copy(ies) exist. Moving might risk data availability." + + # ADDITIONAL CHECK: Verify zone allocation constraints for this table + table_zone_query = """ + SELECT + n.attributes['zone'] as zone, + COUNT(*) as shard_count + FROM sys.shards s + JOIN sys.nodes n ON s.node['id'] = n.id + WHERE s.table_name = ? + AND s.schema_name = ? + AND s.id = ? 
+ AND s.routing_state = 'STARTED' + GROUP BY n.attributes['zone'] + ORDER BY zone + """ + + zone_result = self.client.execute_query(table_zone_query, [ + recommendation.table_name, + recommendation.schema_name, + recommendation.shard_id + ]) + + current_zone_counts = {} + for row in zone_result.get('rows', []): + zone_name, count = row + current_zone_counts[zone_name or 'unknown'] = count + + # Check if adding to target zone would violate balance + target_zone_count = current_zone_counts.get(recommendation.to_zone, 0) + if target_zone_count > 0: + return f"Zone allocation violation: {recommendation.to_zone} would have {target_zone_count + 1} copies after move" + + return None + + except Exception as e: + # If we can't check, err on the side of caution + return f"Cannot verify zone safety: {str(e)}" + + def get_cluster_overview(self) -> Dict[str, Any]: + """Get a comprehensive overview of the cluster""" + # Get cluster watermark settings + watermarks = self.client.get_cluster_watermarks() + + overview = { + 'nodes': len(self.nodes), + 'zones': len(set(node.zone for node in self.nodes)), + 'total_shards': len(self.shards), + 'primary_shards': len([s for s in self.shards if s.is_primary]), + 'replica_shards': len([s for s in self.shards if not s.is_primary]), + 'total_size_gb': sum(s.size_gb for s in self.shards), + 'zone_distribution': defaultdict(int), + 'node_health': [], + 'watermarks': watermarks + } + + # Zone distribution + for shard in self.shards: + overview['zone_distribution'][shard.zone] += 1 + overview['zone_distribution'] = dict(overview['zone_distribution']) + + # Node health with watermark calculations + for node in self.nodes: + node_shards = [s for s in self.shards if s.node_name == node.name] + watermark_info = self._calculate_node_watermark_remaining(node, watermarks) + + overview['node_health'].append({ + 'name': node.name, + 'zone': node.zone, + 'shards': len(node_shards), + 'size_gb': sum(s.size_gb for s in node_shards), + 'disk_usage_percent': 
node.disk_usage_percent, + 'heap_usage_percent': node.heap_usage_percent, + 'available_space_gb': node.available_space_gb, + 'remaining_to_low_watermark_gb': watermark_info['remaining_to_low_gb'], + 'remaining_to_high_watermark_gb': watermark_info['remaining_to_high_gb'] + }) + + return overview + + def _calculate_node_watermark_remaining(self, node: 'NodeInfo', watermarks: Dict[str, Any]) -> Dict[str, float]: + """Calculate remaining space until watermarks are reached""" + + # Parse watermark percentages + low_watermark = self._parse_watermark_percentage(watermarks.get('low', '85%')) + high_watermark = self._parse_watermark_percentage(watermarks.get('high', '90%')) + + # Calculate remaining space to each watermark + total_space_bytes = node.fs_total + current_used_bytes = node.fs_used + + # Space that would be used at each watermark + low_watermark_used_bytes = total_space_bytes * (low_watermark / 100.0) + high_watermark_used_bytes = total_space_bytes * (high_watermark / 100.0) + + # Remaining space until each watermark (negative if already exceeded) + remaining_to_low_gb = max(0, (low_watermark_used_bytes - current_used_bytes) / (1024**3)) + remaining_to_high_gb = max(0, (high_watermark_used_bytes - current_used_bytes) / (1024**3)) + + return { + 'remaining_to_low_gb': remaining_to_low_gb, + 'remaining_to_high_gb': remaining_to_high_gb + } + + def _parse_watermark_percentage(self, watermark_value: str) -> float: + """Parse watermark percentage from string like '85%' or '0.85'""" + if isinstance(watermark_value, str): + if watermark_value.endswith('%'): + return float(watermark_value[:-1]) + else: + # Handle decimal format like '0.85' + decimal_value = float(watermark_value) + if decimal_value <= 1.0: + return decimal_value * 100 + return decimal_value + elif isinstance(watermark_value, (int, float)): + if watermark_value <= 1.0: + return watermark_value * 100 + return watermark_value + else: + # Default to common values if parsing fails + return 85.0 # Default 
low watermark + + def plan_node_decommission(self, node_name: str, + min_free_space_gb: float = 100.0) -> Dict[str, Any]: + """Plan the decommissioning of a node by analyzing required shard moves + + Args: + node_name: Name of the node to decommission + min_free_space_gb: Minimum free space required on target nodes + + Returns: + Dictionary with decommission plan and analysis + """ + # Find the node to decommission + target_node = None + for node in self.nodes: + if node.name == node_name: + target_node = node + break + + if not target_node: + return { + 'error': f"Node {node_name} not found in cluster", + 'feasible': False + } + + # Get all shards on this node (only healthy ones for safety) + node_shards = [s for s in self.shards + if s.node_name == node_name + and s.routing_state == 'STARTED'] + + if not node_shards: + return { + 'node': node_name, + 'zone': target_node.zone, + 'feasible': True, + 'shards_to_move': 0, + 'total_size_gb': 0, + 'recommendations': [], + 'warnings': [], + 'message': 'Node has no healthy shards - safe to decommission' + } + + # Calculate space requirements + total_size_gb = sum(s.size_gb for s in node_shards) + + # Find potential target nodes for each shard + move_plan = [] + warnings = [] + infeasible_moves = [] + + for shard in node_shards: + # Find nodes that can accommodate this shard + potential_targets = self.find_nodes_with_capacity( + shard.size_gb, + exclude_nodes={node_name}, + min_free_space_gb=min_free_space_gb + ) + + if not potential_targets: + infeasible_moves.append({ + 'shard': f"{shard.schema_name}.{shard.table_name}[{shard.shard_id}]", + 'size_gb': shard.size_gb, + 'reason': 'No nodes with sufficient capacity' + }) + continue + + # Check for zone conflicts + safe_targets = [] + for target in potential_targets: + # Create a temporary recommendation to test zone safety + temp_rec = MoveRecommendation( + table_name=shard.table_name, + schema_name=shard.schema_name, + shard_id=shard.shard_id, + from_node=node_name, + 
to_node=target.name, + from_zone=shard.zone, + to_zone=target.zone, + shard_type=shard.shard_type, + size_gb=shard.size_gb, + reason=f"Node decommission: {node_name}" + ) + + zone_conflict = self._check_zone_conflict(temp_rec) + if not zone_conflict: + safe_targets.append(target) + else: + warnings.append(f"Zone conflict for {shard.schema_name}.{shard.table_name}[{shard.shard_id}]: {zone_conflict}") + + if safe_targets: + # Choose the target with most available space + best_target = safe_targets[0] + move_plan.append(MoveRecommendation( + table_name=shard.table_name, + schema_name=shard.schema_name, + shard_id=shard.shard_id, + from_node=node_name, + to_node=best_target.name, + from_zone=shard.zone, + to_zone=best_target.zone, + shard_type=shard.shard_type, + size_gb=shard.size_gb, + reason=f"Node decommission: {node_name}" + )) + else: + infeasible_moves.append({ + 'shard': f"{shard.schema_name}.{shard.table_name}[{shard.shard_id}]", + 'size_gb': shard.size_gb, + 'reason': 'Zone conflicts prevent safe move' + }) + + # Determine feasibility + feasible = len(infeasible_moves) == 0 + + # Add capacity warnings + if feasible: + # Check if remaining cluster capacity is sufficient after decommission + remaining_capacity = sum(n.available_space_gb for n in self.nodes if n.name != node_name) + if remaining_capacity < total_size_gb * 1.2: # 20% safety margin + warnings.append(f"Low remaining capacity after decommission. 
class RecoveryMonitor:
    """Monitor shard recovery operations on a CrateDB cluster.

    Read-only helper: it queries recovery state through the client and
    formats the results for display; it never issues REROUTE commands.
    """

    def __init__(self, client: CrateDBClient):
        self.client = client

    def get_cluster_recovery_status(self,
                                    table_name: Optional[str] = None,
                                    node_name: Optional[str] = None,
                                    recovery_type_filter: str = 'all',
                                    include_transitioning: bool = False) -> List[RecoveryInfo]:
        """Get comprehensive recovery status with minimal cluster impact.

        Args:
            table_name: Restrict results to a single table (None = all tables).
            node_name: Restrict results to a single node (None = all nodes).
            recovery_type_filter: 'all', or a specific recovery type such as
                'PEER'/'DISK'; compared case-insensitively.
            include_transitioning: Also include shards still transitioning.

        Returns:
            List of RecoveryInfo entries matching the filters.
        """
        # One combined query keeps monitoring overhead on the cluster low.
        recoveries = self.client.get_all_recovering_shards(table_name, node_name, include_transitioning)

        # Apply recovery type filter (case-insensitive).
        if recovery_type_filter != 'all':
            wanted = recovery_type_filter.upper()
            recoveries = [r for r in recoveries if r.recovery_type.upper() == wanted]

        return recoveries

    def get_recovery_summary(self, recoveries: List[RecoveryInfo]) -> Dict[str, Any]:
        """Aggregate recovery operations into counts, sizes and progress.

        Returns:
            Dict with keys: 'total_recoveries', 'by_type' (per-type 'count',
            'total_size_gb', 'avg_progress'), 'by_stage' (stage -> count),
            'avg_progress' and 'total_size_gb'.
        """
        if not recoveries:
            return {
                'total_recoveries': 0,
                'by_type': {},
                'by_stage': {},
                'avg_progress': 0.0,
                'total_size_gb': 0.0
            }

        by_type: Dict[str, Dict[str, Any]] = {}
        by_stage: Dict[str, int] = {}
        # Track per-type progress sums so the averages come out of a single
        # pass (the previous version re-filtered the whole list once per
        # type: O(n * types)).
        progress_by_type: Dict[str, float] = {}
        total_progress = 0.0
        total_size_gb = 0.0

        for recovery in recoveries:
            type_stats = by_type.setdefault(recovery.recovery_type, {
                'count': 0,
                'total_size_gb': 0.0,
                'avg_progress': 0.0
            })
            type_stats['count'] += 1
            type_stats['total_size_gb'] += recovery.size_gb
            progress_by_type[recovery.recovery_type] = (
                progress_by_type.get(recovery.recovery_type, 0.0) + recovery.overall_progress
            )

            by_stage[recovery.stage] = by_stage.get(recovery.stage, 0) + 1

            total_progress += recovery.overall_progress
            total_size_gb += recovery.size_gb

        # Finalize per-type averages (count is always >= 1 here).
        for type_name, type_stats in by_type.items():
            type_stats['avg_progress'] = progress_by_type[type_name] / type_stats['count']

        return {
            'total_recoveries': len(recoveries),
            'by_type': by_type,
            'by_stage': by_stage,
            'avg_progress': total_progress / len(recoveries),
            'total_size_gb': total_size_gb
        }

    def format_recovery_display(self, recoveries: List[RecoveryInfo]) -> str:
        """Render recovery information as a multi-section plain-text report.

        Groups recoveries by type (PEER / DISK / other), renders one table
        per group, and appends an overall summary.
        """
        if not recoveries:
            return "βœ… No active shard recoveries found"

        # Group by recovery type for sectioned output.
        peer_recoveries = [r for r in recoveries if r.recovery_type == 'PEER']
        disk_recoveries = [r for r in recoveries if r.recovery_type == 'DISK']
        other_recoveries = [r for r in recoveries if r.recovery_type not in ['PEER', 'DISK']]

        output = [f"\nπŸ”„ Active Shard Recoveries ({len(recoveries)} total)"]
        output.append("=" * 80)

        if peer_recoveries:
            output.append(f"\nπŸ“‘ PEER Recoveries ({len(peer_recoveries)})")
            output.append(self._format_recovery_table(peer_recoveries))

        if disk_recoveries:
            output.append(f"\nπŸ’Ύ DISK Recoveries ({len(disk_recoveries)})")
            output.append(self._format_recovery_table(disk_recoveries))

        if other_recoveries:
            output.append(f"\nπŸ”§ Other Recoveries ({len(other_recoveries)})")
            output.append(self._format_recovery_table(other_recoveries))

        # Add summary (plain strings here: no placeholders needed).
        summary = self.get_recovery_summary(recoveries)
        output.append("\nπŸ“Š Summary:")
        output.append(f"   Total size: {summary['total_size_gb']:.1f} GB")
        output.append(f"   Average progress: {summary['avg_progress']:.1f}%")

        return "\n".join(output)

    def _format_recovery_table(self, recoveries: List[RecoveryInfo]) -> str:
        """Format one group of recoveries as a fixed-width ASCII table."""
        if not recoveries:
            return "   No recoveries of this type"

        # Table headers; columns grow to fit the widest cell.
        headers = ["Table", "Shard", "Node", "Type", "Stage", "Progress", "Size(GB)", "Time(s)"]
        col_widths = [len(h) for h in headers]

        rows = []
        for recovery in recoveries:
            row = [
                f"{recovery.schema_name}.{recovery.table_name}",
                str(recovery.shard_id),
                recovery.node_name,
                recovery.shard_type,
                recovery.stage,
                f"{recovery.overall_progress:.1f}%",
                f"{recovery.size_gb:.1f}",
                f"{recovery.total_time_seconds:.1f}"
            ]
            rows.append(row)

            # Update column widths as we go.
            for i, cell in enumerate(row):
                col_widths[i] = max(col_widths[i], len(cell))

        output = []

        # Header row, then a dashed separator matching the table width
        # (minus the 3-character leading indent).
        header_row = "   " + " | ".join(h.ljust(w) for h, w in zip(headers, col_widths))
        output.append(header_row)
        output.append("   " + "-" * (len(header_row) - 3))

        for row in rows:
            output.append("   " + " | ".join(cell.ljust(w) for cell, w in zip(row, col_widths)))

        return "\n".join(output)
NODE_NAME: Name of the node to decommission +# """ +# client = ctx.obj['client'] +# analyzer = ShardAnalyzer(client) +# +# mode_text = "PLANNING MODE" if dry_run else "EXECUTION MODE" +# console.print(Panel.fit(f"[bold blue]Node Decommission Analysis[/bold blue] - [bold {'green' if dry_run else 'red'}]{mode_text}[/bold {'green' if dry_run else 'red'}]")) +# console.print(f"[dim]Analyzing decommission plan for node: {node_name}[/dim]") +# console.print() +# +# # Generate decommission plan +# plan = analyzer.plan_node_decommission(node_name, min_free_space) +# +# if 'error' in plan: +# console.print(f"[red]Error: {plan['error']}[/red]") +# return +# +# # Display plan summary +# summary_table = Table(title=f"Decommission Plan for {node_name}", box=box.ROUNDED) +# summary_table.add_column("Metric", style="cyan") +# summary_table.add_column("Value", style="magenta") +# +# summary_table.add_row("Node", plan['node']) +# summary_table.add_row("Zone", plan['zone']) +# summary_table.add_row("Feasible", "[green]βœ“ Yes[/green]" if plan['feasible'] else "[red]βœ— No[/red]") +# summary_table.add_row("Shards to Move", str(plan['shards_to_move'])) +# summary_table.add_row("Moveable Shards", str(plan['moveable_shards'])) +# summary_table.add_row("Total Data Size", format_size(plan['total_size_gb'])) +# summary_table.add_row("Estimated Time", f"{plan['estimated_time_hours']:.1f} hours") +# +# console.print(summary_table) +# console.print() +# +# # Show warnings if any +# if plan['warnings']: +# console.print("[bold yellow]⚠ Warnings:[/bold yellow]") +# for warning in plan['warnings']: +# console.print(f" β€’ [yellow]{warning}[/yellow]") +# console.print() +# +# # Show infeasible moves if any +# if plan['infeasible_moves']: +# console.print("[bold red]βœ— Cannot Move:[/bold red]") +# infeasible_table = Table(box=box.ROUNDED) +# infeasible_table.add_column("Shard", style="cyan") +# infeasible_table.add_column("Size", style="magenta") +# infeasible_table.add_column("Reason", 
style="red") +# +# for move in plan['infeasible_moves']: +# infeasible_table.add_row( +# move['shard'], +# format_size(move['size_gb']), +# move['reason'] +# ) +# console.print(infeasible_table) +# console.print() +# +# # Show move recommendations +# if plan['recommendations']: +# move_table = Table(title="Required Shard Moves", box=box.ROUNDED) +# move_table.add_column("Table", style="cyan") +# move_table.add_column("Shard", justify="right", style="magenta") +# move_table.add_column("Type", style="blue") +# move_table.add_column("Size", style="green") +# move_table.add_column("From Zone", style="yellow") +# move_table.add_column("To Node", style="cyan") +# move_table.add_column("To Zone", style="yellow") +# +# for rec in plan['recommendations']: +# move_table.add_row( +# f"{rec.schema_name}.{rec.table_name}", +# str(rec.shard_id), +# rec.shard_type, +# format_size(rec.size_gb), +# rec.from_zone, +# rec.to_node, +# rec.to_zone +# ) +# +# console.print(move_table) +# console.print() +# +# # Generate SQL commands if not in dry-run mode +# if not dry_run and plan['feasible']: +# console.print(Panel.fit("[bold green]Decommission SQL Commands[/bold green]")) +# console.print("[dim]# Execute these commands in order to prepare for node decommission[/dim]") +# console.print("[dim]# ALWAYS test in a non-production environment first![/dim]") +# console.print("[dim]# Monitor shard health after each move before proceeding[/dim]") +# console.print() +# +# for i, rec in enumerate(plan['recommendations'], 1): +# console.print(f"-- Move {i}: {rec.reason}") +# console.print(f"{rec.to_sql()}") +# console.print() +# +# console.print(f"-- After all moves complete, the node {node_name} can be safely removed") +# console.print(f"-- Total moves required: {len(plan['recommendations'])}") +# elif dry_run: +# console.print("[green]βœ“ Decommission plan ready. 
def format_size(size_gb: float) -> str:
    """Human-readable size string for a value given in gigabytes.

    Values of 1000 GB and above render as TB, 1 GB and above as GB,
    anything smaller as whole MB.
    """
    if size_gb < 1:
        return f"{size_gb*1000:.0f}MB"
    if size_gb < 1000:
        return f"{size_gb:.1f}GB"
    return f"{size_gb/1000:.1f}TB"


def format_percentage(value: float) -> str:
    """Rich-markup percentage colored by severity: >80 red, >70 yellow, else green."""
    if value > 80:
        color = "red"
    elif value > 70:
        color = "yellow"
    else:
        color = "green"
    return f"[{color}]{value:.1f}%[/{color}]"
recovery_info.translog_size_bytes + + # Only show if significant (>10MB for production) + if tl_bytes < 10 * 1024 * 1024: # 10MB for production + return "" + + tl_gb = recovery_info.translog_size_gb + + # Color coding based on size + if tl_gb >= 5.0: + color = "red" + elif tl_gb >= 1.0: + color = "yellow" + else: + color = "green" + + # Format size + if tl_gb >= 1.0: + size_str = f"{tl_gb:.1f}GB" + else: + size_str = f"{tl_gb*1000:.0f}MB" + + return f" [dim]([{color}]TL:{size_str}[/{color}])[/dim]" + + +@click.group() +@click.version_option() +@click.pass_context +def main(ctx): + """XMover - CrateDB Shard Analyzer and Movement Tool + + A tool for analyzing CrateDB shard distribution across nodes and availability zones, + and generating safe SQL commands for shard rebalancing. + """ + ctx.ensure_object(dict) + + # Test connection on startup + try: + client = CrateDBClient() + if not client.test_connection(): + console.print("[red]Error: Could not connect to CrateDB[/red]") + console.print("Please check your CRATE_CONNECTION_STRING in .env file") + sys.exit(1) + ctx.obj['client'] = client + except Exception as e: + console.print(f"[red]Error connecting to CrateDB: {e}[/red]") + sys.exit(1) + + +@main.command() +@click.option('--table', '-t', help='Analyze specific table only') +@click.pass_context +def analyze(ctx, table: Optional[str]): + """Analyze current shard distribution across nodes and zones""" + client = ctx.obj['client'] + analyzer = ShardAnalyzer(client) + + console.print(Panel.fit("[bold blue]CrateDB Cluster Analysis[/bold blue]")) + + # Get cluster overview (includes all shards for complete analysis) + overview = analyzer.get_cluster_overview() + + # Cluster summary table + summary_table = Table(title="Cluster Summary", box=box.ROUNDED) + summary_table.add_column("Metric", style="cyan") + summary_table.add_column("Value", style="magenta") + + summary_table.add_row("Nodes", str(overview['nodes'])) + summary_table.add_row("Availability Zones", 
str(overview['zones'])) + summary_table.add_row("Total Shards", str(overview['total_shards'])) + summary_table.add_row("Primary Shards", str(overview['primary_shards'])) + summary_table.add_row("Replica Shards", str(overview['replica_shards'])) + summary_table.add_row("Total Size", format_size(overview['total_size_gb'])) + + console.print(summary_table) + console.print() + + # Disk watermarks table + if overview.get('watermarks'): + watermarks_table = Table(title="Disk Allocation Watermarks", box=box.ROUNDED) + watermarks_table.add_column("Setting", style="cyan") + watermarks_table.add_column("Value", style="magenta") + + watermarks = overview['watermarks'] + watermarks_table.add_row("Low Watermark", str(watermarks.get('low', 'Not set'))) + watermarks_table.add_row("High Watermark", str(watermarks.get('high', 'Not set'))) + watermarks_table.add_row("Flood Stage", str(watermarks.get('flood_stage', 'Not set'))) + watermarks_table.add_row("Enable for Single Node", str(watermarks.get('enable_for_single_data_node', 'Not set'))) + + console.print(watermarks_table) + console.print() + + # Zone distribution table + zone_table = Table(title="Zone Distribution", box=box.ROUNDED) + zone_table.add_column("Zone", style="cyan") + zone_table.add_column("Shards", justify="right", style="magenta") + zone_table.add_column("Percentage", justify="right", style="green") + + total_shards = overview['total_shards'] + for zone, count in overview['zone_distribution'].items(): + percentage = (count / total_shards * 100) if total_shards > 0 else 0 + zone_table.add_row(zone, str(count), f"{percentage:.1f}%") + + console.print(zone_table) + console.print() + + # Node health table + node_table = Table(title="Node Health", box=box.ROUNDED) + node_table.add_column("Node", style="cyan") + node_table.add_column("Zone", style="blue") + node_table.add_column("Shards", justify="right", style="magenta") + node_table.add_column("Size", justify="right", style="green") + node_table.add_column("Disk 
@main.command()
@click.option('--table', '-t', help='Find candidates for specific table only')
@click.option('--min-size', default=40.0, help='Minimum shard size in GB (default: 40)')
@click.option('--max-size', default=60.0, help='Maximum shard size in GB (default: 60)')
@click.option('--limit', default=20, help='Maximum number of candidates to show (default: 20)')
@click.option('--node', help='Only show candidates from this specific source node (e.g., data-hot-4)')
@click.pass_context
def find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, limit: int, node: Optional[str]):
    """Find shard candidates for movement based on size criteria

    Results are sorted by nodes with least available space first,
    then by shard size (smallest first) for easier moves.
    """
    # Reuse the CLI-wide client created in main(); build an analyzer on top.
    client = ctx.obj['client']
    analyzer = ShardAnalyzer(client)

    console.print(Panel.fit(f"[bold blue]Finding Moveable Shards ({min_size}-{max_size}GB)[/bold blue]"))

    if node:
        console.print(f"[dim]Filtering: Only showing candidates from source node '{node}'[/dim]")

    # Find moveable candidates (only healthy shards suitable for operations)
    candidates = analyzer.find_moveable_shards(min_size, max_size, table)

    # Filter by node if specified
    if node:
        candidates = [c for c in candidates if c.node_name == node]

    # Empty result: print a node-specific hint when --node was used,
    # otherwise a generic message, then bail out early.
    if not candidates:
        if node:
            console.print(f"[yellow]No moveable shards found on node '{node}' in the specified size range.[/yellow]")
            console.print(f"[dim]Tip: Try different size ranges or remove --node filter to see all candidates[/dim]")
        else:
            console.print("[yellow]No moveable shards found in the specified size range.[/yellow]")
        return

    # Show limited results
    shown_candidates = candidates[:limit]

    candidates_table = Table(title=f"Moveable Shard Candidates (showing {len(shown_candidates)} of {len(candidates)})", box=box.ROUNDED)
    candidates_table.add_column("Table", style="cyan")
    candidates_table.add_column("Shard ID", justify="right", style="magenta")
    candidates_table.add_column("Type", style="blue")
    candidates_table.add_column("Node", style="green")
    candidates_table.add_column("Zone", style="yellow")
    candidates_table.add_column("Size", justify="right", style="red")
    candidates_table.add_column("Node Free Space", justify="right", style="white")
    candidates_table.add_column("Documents", justify="right", style="dim")

    # Create a mapping of node names to available space for display.
    # NOTE(review): the comprehension variable `node` shadows the --node
    # option; harmless in Python 3 (comprehension scope is separate), but
    # worth renaming eventually.
    node_space_map = {node.name: node.available_space_gb for node in analyzer.nodes}

    for shard in shown_candidates:
        # Default to 0 free space if the shard's node is unknown to the analyzer.
        node_free_space = node_space_map.get(shard.node_name, 0)
        candidates_table.add_row(
            f"{shard.schema_name}.{shard.table_name}",
            str(shard.shard_id),
            shard.shard_type,
            shard.node_name,
            shard.zone,
            format_size(shard.size_gb),
            format_size(node_free_space),
            f"{shard.num_docs:,}"
        )

    console.print(candidates_table)

    # Tell the user how many candidates were truncated by --limit.
    if len(candidates) > limit:
        console.print(f"\n[dim]... and {len(candidates) - limit} more candidates[/dim]")
for confirmation)') +@click.option('--node', help='Only recommend moves from this specific source node (e.g., data-hot-4)') +@click.pass_context +def recommend(ctx, table: Optional[str], min_size: float, max_size: float, + zone_tolerance: float, min_free_space: float, max_moves: int, max_disk_usage: float, validate: bool, prioritize_space: bool, dry_run: bool, auto_execute: bool, node: Optional[str]): + """Generate shard movement recommendations for rebalancing""" + client = ctx.obj['client'] + analyzer = ShardAnalyzer(client) + + # Safety check for auto-execute + if auto_execute and dry_run: + console.print("[red]❌ Error: --auto-execute requires --execute flag[/red]") + console.print("[dim]Use: --execute --auto-execute[/dim]") + return + + mode_text = "DRY RUN - Analysis Only" if dry_run else "EXECUTION MODE" + console.print(Panel.fit(f"[bold blue]Generating Rebalancing Recommendations[/bold blue] - [bold {'green' if dry_run else 'red'}]{mode_text}[/bold {'green' if dry_run else 'red'}]")) + console.print("[dim]Note: Only analyzing healthy shards (STARTED + 100% recovered) for safe operations[/dim]") + console.print("[dim]Zone conflict detection: Prevents moves that would violate CrateDB's zone awareness[/dim]") + if prioritize_space: + console.print("[dim]Mode: Prioritizing available space over zone balancing[/dim]") + else: + console.print("[dim]Mode: Prioritizing zone balancing over available space[/dim]") + + if node: + console.print(f"[dim]Filtering: Only showing moves from source node '{node}'[/dim]") + + console.print(f"[dim]Safety thresholds: Max disk usage {max_disk_usage}%, Min free space {min_free_space}GB[/dim]") + + if dry_run: + console.print("[green]Running in DRY RUN mode - no SQL commands will be generated[/green]") + else: + console.print("[red]EXECUTION MODE - SQL commands will be generated for actual moves[/red]") + console.print() + + recommendations = analyzer.generate_rebalancing_recommendations( + table_name=table, + min_size_gb=min_size, + 
max_size_gb=max_size, + zone_tolerance_percent=zone_tolerance, + min_free_space_gb=min_free_space, + max_recommendations=max_moves, + prioritize_space=prioritize_space, + source_node=node, + max_disk_usage_percent=max_disk_usage + ) + + if not recommendations: + if node: + console.print(f"[yellow]No safe recommendations found for node '{node}'[/yellow]") + console.print(f"[dim]This could be due to:[/dim]") + console.print(f"[dim] β€’ Zone conflicts preventing safe moves[/dim]") + console.print(f"[dim] β€’ Target nodes exceeding {max_disk_usage}% disk usage threshold[/dim]") + console.print(f"[dim] β€’ Insufficient free space on target nodes (need {min_free_space}GB)[/dim]") + console.print(f"[dim] β€’ No shards in size range {min_size}-{max_size}GB[/dim]") + console.print(f"[dim]Suggestions:[/dim]") + console.print(f"[dim] β€’ Try: --max-disk-usage 95 (allow higher disk usage)[/dim]") + console.print(f"[dim] β€’ Try: --min-free-space 50 (reduce space requirements)[/dim]") + console.print(f"[dim] β€’ Try: different size ranges or remove --node filter[/dim]") + else: + console.print("[green]No rebalancing recommendations needed. 
Cluster appears well balanced![/green]") + return + + # Show recommendations table + rec_table = Table(title=f"Rebalancing Recommendations ({len(recommendations)} moves)", box=box.ROUNDED) + rec_table.add_column("Table", style="cyan") + rec_table.add_column("Shard", justify="right", style="magenta") + rec_table.add_column("Type", style="blue") + rec_table.add_column("From Node", style="red") + rec_table.add_column("To Node", style="green") + rec_table.add_column("Target Free Space", justify="right", style="cyan") + rec_table.add_column("Zone Change", style="yellow") + rec_table.add_column("Size", justify="right", style="white") + rec_table.add_column("Reason", style="dim") + if validate: + rec_table.add_column("Safety Check", style="bold") + + # Create a mapping of node names to available space for display + node_space_map = {node.name: node.available_space_gb for node in analyzer.nodes} + + for rec in recommendations: + zone_change = f"{rec.from_zone} β†’ {rec.to_zone}" if rec.from_zone != rec.to_zone else rec.from_zone + target_free_space = node_space_map.get(rec.to_node, 0) + + row = [ + f"{rec.schema_name}.{rec.table_name}", + str(rec.shard_id), + rec.shard_type, + rec.from_node, + rec.to_node, + format_size(target_free_space), + zone_change, + format_size(rec.size_gb), + rec.reason + ] + + if validate: + is_safe, safety_msg = analyzer.validate_move_safety(rec, max_disk_usage_percent=max_disk_usage) + safety_status = "[green]βœ“ SAFE[/green]" if is_safe else f"[red]βœ— {safety_msg}[/red]" + row.append(safety_status) + + rec_table.add_row(*row) + + console.print(rec_table) + console.print() + + # Generate SQL commands or show dry-run analysis + if dry_run: + console.print(Panel.fit("[bold yellow]Dry Run Analysis - No Commands Generated[/bold yellow]")) + console.print("[dim]# This is a dry run - showing what would be recommended[/dim]") + console.print("[dim]# Use --execute flag to generate actual SQL commands[/dim]") + console.print() + + safe_moves = 0 + 
zone_conflicts = 0 + space_issues = 0 + + for i, rec in enumerate(recommendations, 1): + if validate: + is_safe, safety_msg = analyzer.validate_move_safety(rec, max_disk_usage_percent=max_disk_usage) + if not is_safe: + if "zone conflict" in safety_msg.lower(): + zone_conflicts += 1 + console.print(f"[yellow]⚠ Move {i}: WOULD BE SKIPPED - {safety_msg}[/yellow]") + elif "space" in safety_msg.lower(): + space_issues += 1 + console.print(f"[yellow]⚠ Move {i}: WOULD BE SKIPPED - {safety_msg}[/yellow]") + else: + console.print(f"[yellow]⚠ Move {i}: WOULD BE SKIPPED - {safety_msg}[/yellow]") + continue + safe_moves += 1 + + console.print(f"[green]βœ“ Move {i}: WOULD EXECUTE - {rec.reason}[/green]") + console.print(f"[dim] Target SQL: {rec.to_sql()}[/dim]") + + console.print() + console.print(f"[bold]Dry Run Summary:[/bold]") + console.print(f" β€’ Safe moves that would execute: [green]{safe_moves}[/green]") + console.print(f" β€’ Zone conflicts prevented: [yellow]{zone_conflicts}[/yellow]") + console.print(f" β€’ Space-related issues: [yellow]{space_issues}[/yellow]") + if safe_moves > 0: + console.print(f"\n[green]βœ“ Ready to execute {safe_moves} safe moves. Use --execute to generate SQL commands.[/green]") + else: + console.print(f"\n[yellow]⚠ No safe moves identified. 
@main.command()
@click.option('--connection-string', help='Override connection string from .env')
@click.pass_context
def test_connection(ctx, connection_string: Optional[str]):
    """Test connection to CrateDB cluster

    Uses the optional --connection-string override; otherwise the client
    falls back to its default configuration (CRATE_CONNECTION_STRING).
    Exits with status 1 on any connection failure.
    """
    try:
        # Build a fresh client rather than reusing ctx.obj['client'], so the
        # override (if given) is actually exercised.
        if connection_string:
            client = CrateDBClient(connection_string)
        else:
            client = CrateDBClient()

        if client.test_connection():
            console.print("[green]βœ“ Connection successful![/green]")

            # Get basic cluster info
            nodes = client.get_nodes_info()
            console.print(f"Connected to cluster with {len(nodes)} nodes:")
            for node in nodes:
                console.print(f" β€’ {node.name} (zone: {node.zone})")
        else:
            console.print("[red]βœ— Connection failed[/red]")
            sys.exit(1)

    except Exception as e:
        # Broad catch is intentional here: any driver/network error should
        # surface as a friendly message plus non-zero exit, not a traceback.
        console.print(f"[red]βœ— Connection error: {e}[/red]")
        sys.exit(1)
zone_stats.items(): + total = stats['TOTAL'] + + if tolerance_range[0] <= total <= tolerance_range[1]: + status = "[green]βœ“ Balanced[/green]" + elif total < tolerance_range[0]: + status = f"[yellow]⚠ Under ({total - target_per_zone:+})[/yellow]" + else: + status = f"[red]⚠ Over ({total - target_per_zone:+})[/red]" + + balance_table.add_row( + zone, + str(stats['PRIMARY']), + str(stats['REPLICA']), + str(total), + status + ) + + console.print(balance_table) + + +@main.command() +@click.option('--table', '-t', help='Analyze zones for specific table only') +@click.option('--show-shards/--no-show-shards', default=False, help='Show individual shard details (default: False)') +@click.pass_context +def zone_analysis(ctx, table: Optional[str], show_shards: bool): + """Detailed analysis of zone distribution and potential conflicts""" + client = ctx.obj['client'] + + console.print(Panel.fit("[bold blue]Detailed Zone Analysis[/bold blue]")) + console.print("[dim]Comprehensive zone distribution analysis for CrateDB cluster[/dim]") + console.print() + + # Get all shards for analysis + shards = client.get_shards_info(table_name=table, for_analysis=True) + + if not shards: + console.print("[yellow]No shards found for analysis[/yellow]") + return + + # Organize by table and shard + tables = {} + for shard in shards: + table_key = f"{shard.schema_name}.{shard.table_name}" + if table_key not in tables: + tables[table_key] = {} + + shard_key = shard.shard_id + if shard_key not in tables[table_key]: + tables[table_key][shard_key] = [] + + tables[table_key][shard_key].append(shard) + + # Analyze each table + zone_conflicts = 0 + under_replicated = 0 + + for table_name, table_shards in tables.items(): + console.print(f"\n[bold cyan]Table: {table_name}[/bold cyan]") + + # Create analysis table + analysis_table = Table(title=f"Shard Distribution for {table_name}", box=box.ROUNDED) + analysis_table.add_column("Shard ID", justify="right", style="magenta") + 
analysis_table.add_column("Primary Zone", style="blue") + analysis_table.add_column("Replica Zones", style="green") + analysis_table.add_column("Total Copies", justify="right", style="cyan") + analysis_table.add_column("Status", style="bold") + + for shard_id, shard_copies in sorted(table_shards.items()): + primary_zone = "Unknown" + replica_zones = set() + total_copies = len(shard_copies) + zones_with_copies = set() + + for shard_copy in shard_copies: + zones_with_copies.add(shard_copy.zone) + if shard_copy.is_primary: + primary_zone = shard_copy.zone + else: + replica_zones.add(shard_copy.zone) + + # Determine status + status_parts = [] + if len(zones_with_copies) == 1: + zone_conflicts += 1 + status_parts.append("[red]⚠ ZONE CONFLICT[/red]") + + if total_copies < 2: # Assuming we want at least 1 replica + under_replicated += 1 + status_parts.append("[yellow]⚠ Under-replicated[/yellow]") + + if not status_parts: + status_parts.append("[green]βœ“ Good[/green]") + + replica_zones_str = ", ".join(sorted(replica_zones)) if replica_zones else "None" + + analysis_table.add_row( + str(shard_id), + primary_zone, + replica_zones_str, + str(total_copies), + " ".join(status_parts) + ) + + # Show individual shard details if requested + if show_shards: + for shard_copy in shard_copies: + health_indicator = "βœ“" if shard_copy.routing_state == 'STARTED' else "⚠" + console.print(f" {health_indicator} {shard_copy.shard_type} on {shard_copy.node_name} ({shard_copy.zone}) - {shard_copy.routing_state}") + + console.print(analysis_table) + + # Summary + console.print(f"\n[bold]Zone Analysis Summary:[/bold]") + console.print(f" β€’ Tables analyzed: [cyan]{len(tables)}[/cyan]") + console.print(f" β€’ Zone conflicts detected: [red]{zone_conflicts}[/red]") + console.print(f" β€’ Under-replicated shards: [yellow]{under_replicated}[/yellow]") + + if zone_conflicts > 0: + console.print(f"\n[red]⚠ Found {zone_conflicts} zone conflicts that need attention![/red]") + console.print("[dim]Zone 
conflicts occur when all copies of a shard are in the same zone.[/dim]") + console.print("[dim]This violates CrateDB's zone-awareness and creates availability risks.[/dim]") + + if under_replicated > 0: + console.print(f"\n[yellow]⚠ Found {under_replicated} under-replicated shards.[/yellow]") + console.print("[dim]Consider increasing replication for better availability.[/dim]") + + if zone_conflicts == 0 and under_replicated == 0: + console.print("\n[green]βœ“ No critical zone distribution issues detected![/green]") + + +# @main.command() +# @click.argument('node_name') +# @click.option('--min-free-space', default=100.0, help='Minimum free space required on target nodes in GB (default: 100)') +# @click.option('--dry-run/--execute', default=True, help='Show decommission plan without generating SQL commands (default: True)') +# @click.pass_context +# def decommission(ctx, node_name: str, min_free_space: float, dry_run: bool): +# """Plan decommissioning of a node by analyzing required shard moves +# +# NODE_NAME: Name of the node to decommission +# """ +# client = ctx.obj['client'] +# analyzer = ShardAnalyzer(client) +# +# mode_text = "PLANNING MODE" if dry_run else "EXECUTION MODE" +# console.print(Panel.fit(f"[bold blue]Node Decommission Analysis[/bold blue] - [bold {'green' if dry_run else 'red'}]{mode_text}[/bold {'green' if dry_run else 'red'}]")) +# console.print(f"[dim]Analyzing decommission plan for node: {node_name}[/dim]") +# console.print() +# +# # Generate decommission plan +# plan = analyzer.plan_node_decommission(node_name, min_free_space) +# +# if 'error' in plan: +# console.print(f"[red]Error: {plan['error']}[/red]") +# return +# +# # Display plan summary +# summary_table = Table(title=f"Decommission Plan for {node_name}", box=box.ROUNDED) +# summary_table.add_column("Metric", style="cyan") +# summary_table.add_column("Value", style="magenta") +# +# summary_table.add_row("Node", plan['node']) +# summary_table.add_row("Zone", plan['zone']) +# 
@main.command()
@click.argument('schema_table')
@click.argument('shard_id', type=int)
@click.argument('from_node')
@click.argument('to_node')
@click.option('--max-disk-usage', default=90.0, help='Maximum disk usage percentage for target node (default: 90)')
@click.pass_context
def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node: str, max_disk_usage: float):
    """Validate a specific shard move before execution

    SCHEMA_TABLE: Schema and table name (format: schema.table)
    SHARD_ID: Shard ID to move
    FROM_NODE: Source node name
    TO_NODE: Target node name

    Example: xmover validate-move CUROV.maddoxxFormfactor 4 data-hot-1 data-hot-3
    """
    client = ctx.obj['client']
    analyzer = ShardAnalyzer(client)

    # Parse schema and table
    if '.' not in schema_table:
        console.print("[red]Error: Schema and table must be in format 'schema.table'[/red]")
        return
    schema_name, table_name = schema_table.split('.', 1)

    console.print(Panel.fit("[bold blue]Validating Shard Move[/bold blue]"))
    console.print(f"[dim]Move: {schema_name}.{table_name}[{shard_id}] from {from_node} to {to_node}[/dim]")
    console.print()

    # Resolve node names against the cluster's node list.
    from_node_info = next((n for n in analyzer.nodes if n.name == from_node), None)
    to_node_info = next((n for n in analyzer.nodes if n.name == to_node), None)

    if not from_node_info:
        console.print(f"[red]βœ— Source node '{from_node}' not found in cluster[/red]")
        return
    if not to_node_info:
        console.print(f"[red]βœ— Target node '{to_node}' not found in cluster[/red]")
        return

    # Locate the specific shard copy on the source node.
    target_shard = next(
        (s for s in analyzer.shards
         if s.schema_name == schema_name
         and s.table_name == table_name
         and s.shard_id == shard_id
         and s.node_name == from_node),
        None,
    )
    if not target_shard:
        console.print(f"[red]βœ— Shard {shard_id} not found on node {from_node}[/red]")
        console.print("[dim]Use 'xmover find-candidates' to see available shards[/dim]")
        return

    # Create a move recommendation for validation
    recommendation = MoveRecommendation(
        table_name=table_name,
        schema_name=schema_name,
        shard_id=shard_id,
        from_node=from_node,
        to_node=to_node,
        from_zone=from_node_info.zone,
        to_zone=to_node_info.zone,
        shard_type=target_shard.shard_type,
        size_gb=target_shard.size_gb,
        reason="Manual validation"
    )

    # Display shard details
    details_table = Table(title="Shard Details", box=box.ROUNDED)
    details_table.add_column("Property", style="cyan")
    details_table.add_column("Value", style="magenta")

    details_table.add_row("Table", f"{schema_name}.{table_name}")
    details_table.add_row("Shard ID", str(shard_id))
    details_table.add_row("Type", target_shard.shard_type)
    details_table.add_row("Size", format_size(target_shard.size_gb))
    details_table.add_row("Documents", f"{target_shard.num_docs:,}")
    details_table.add_row("State", target_shard.state)
    details_table.add_row("Routing State", target_shard.routing_state)
    details_table.add_row("From Node", f"{from_node} ({from_node_info.zone})")
    details_table.add_row("To Node", f"{to_node} ({to_node_info.zone})")
    details_table.add_row("Zone Change", "Yes" if from_node_info.zone != to_node_info.zone else "No")

    console.print(details_table)
    console.print()

    # Perform comprehensive validation
    is_safe, safety_msg = analyzer.validate_move_safety(recommendation, max_disk_usage_percent=max_disk_usage)

    if is_safe:
        console.print("[green]βœ“ VALIDATION PASSED - Move appears safe[/green]")
        console.print(f"[green]βœ“ {safety_msg}[/green]")
        console.print()

        # Show the SQL command
        console.print(Panel.fit("[bold green]Ready to Execute[/bold green]"))
        console.print("[dim]# Copy and paste this command to execute the move[/dim]")
        console.print()
        console.print(f"{recommendation.to_sql()}")
        console.print()
        console.print("[dim]# Monitor shard health after execution[/dim]")
        # Bug fix: this hint was missing its f-prefix and printed the literal
        # text "{table_name}" / "{shard_id}" instead of the actual values.
        console.print(f"[dim]# Check with: SELECT * FROM sys.shards WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]")
    else:
        console.print("[red]βœ— VALIDATION FAILED - Move not safe[/red]")
        console.print(f"[red]βœ— {safety_msg}[/red]")
        console.print()

        # Provide troubleshooting guidance keyed off the failure message.
        failure = safety_msg.lower()
        if "zone conflict" in failure:
            console.print("[yellow]πŸ’‘ Troubleshooting Zone Conflicts:[/yellow]")
            console.print(" β€’ Check current shard distribution: xmover zone-analysis --show-shards")
            console.print(" β€’ Try moving to a different zone")
            console.print(" β€’ Verify cluster has proper zone-awareness configuration")
        elif "node conflict" in failure:
            console.print("[yellow]πŸ’‘ Troubleshooting Node Conflicts:[/yellow]")
            console.print(" β€’ The target node already has a copy of this shard")
            console.print(" β€’ Choose a different target node")
            console.print(" β€’ Check shard distribution: xmover analyze")
        elif "space" in failure:
            console.print("[yellow]πŸ’‘ Troubleshooting Space Issues:[/yellow]")
            console.print(" β€’ Free up space on the target node")
            console.print(" β€’ Choose a node with more available capacity")
            console.print(" β€’ Check node capacity: xmover analyze")
        elif "usage" in failure:
            console.print("[yellow]πŸ’‘ Troubleshooting High Disk Usage:[/yellow]")
            console.print(" β€’ Wait for target node disk usage to decrease")
            console.print(" β€’ Choose a node with lower disk usage")
            console.print(" β€’ Check cluster health: xmover analyze")
            console.print(" β€’ Consider using --max-disk-usage option for urgent moves")
break + error_message = "\n".join(lines) + + if not error_message.strip(): + console.print("[yellow]No error message provided[/yellow]") + return + + console.print(f"[dim]Analyzing error message...[/dim]") + console.print() + + # Common CrateDB allocation error patterns and solutions + error_patterns = [ + { + "pattern": "a copy of this shard is already allocated to this node", + "title": "Node Already Has Shard Copy", + "explanation": "The target node already contains a copy (primary or replica) of this shard.", + "solutions": [ + "Choose a different target node that doesn't have this shard", + "Use 'xmover zone-analysis --show-shards' to see current distribution", + "Verify the shard ID and table name are correct" + ], + "prevention": "Always check current shard locations before moving" + }, + { + "pattern": "there are too many copies of the shard allocated to nodes with attribute", + "title": "Zone Allocation Limit Exceeded", + "explanation": "CrateDB's zone awareness prevents too many copies in the same zone.", + "solutions": [ + "Move the shard to a different availability zone", + "Check zone balance with 'xmover check-balance'", + "Ensure target zone doesn't already have copies of this shard" + ], + "prevention": "Use 'xmover recommend' which respects zone constraints" + }, + { + "pattern": "not enough disk space", + "title": "Insufficient Disk Space", + "explanation": "The target node doesn't have enough free disk space for the shard.", + "solutions": [ + "Free up space on the target node", + "Choose a node with more available capacity", + "Check available space with 'xmover analyze'" + ], + "prevention": "Use '--min-free-space' parameter in recommendations" + }, + { + "pattern": "shard recovery limit", + "title": "Recovery Limit Exceeded", + "explanation": "Too many shards are currently being moved/recovered simultaneously.", + "solutions": [ + "Wait for current recoveries to complete", + "Check recovery status in CrateDB admin UI", + "Reduce concurrent 
recoveries in cluster settings" + ], + "prevention": "Move shards gradually, monitor recovery progress" + }, + { + "pattern": "allocation is disabled", + "title": "Allocation Disabled", + "explanation": "Shard allocation is temporarily disabled in the cluster.", + "solutions": [ + "Re-enable allocation: PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":\"all\"}}", + "Check if allocation was disabled for maintenance", + "Verify cluster health before re-enabling" + ], + "prevention": "Check allocation status before performing moves" + } + ] + + # Find matching patterns + matches = [] + error_lower = error_message.lower() + + for pattern_info in error_patterns: + if pattern_info["pattern"].lower() in error_lower: + matches.append(pattern_info) + + if matches: + for i, match in enumerate(matches): + if i > 0: + console.print("\n" + "─" * 60 + "\n") + + console.print(f"[bold red]🚨 {match['title']}[/bold red]") + console.print(f"[yellow]πŸ“ Explanation:[/yellow] {match['explanation']}") + console.print() + + console.print("[green]πŸ’‘ Solutions:[/green]") + for j, solution in enumerate(match['solutions'], 1): + console.print(f" {j}. {solution}") + console.print() + + console.print(f"[blue]πŸ›‘οΈ Prevention:[/blue] {match['prevention']}") + else: + console.print("[yellow]⚠ No specific pattern match found[/yellow]") + console.print() + console.print("[bold]General Troubleshooting Steps:[/bold]") + console.print("1. Check current shard distribution: [cyan]xmover analyze[/cyan]") + console.print("2. Validate the specific move: [cyan]xmover validate-move schema.table shard_id from_node to_node[/cyan]") + console.print("3. Check zone conflicts: [cyan]xmover zone-analysis --show-shards[/cyan]") + console.print("4. Verify node capacity: [cyan]xmover analyze[/cyan]") + console.print("5. 
@main.command()
@click.option('--table', '-t', help='Monitor recovery for specific table only')
@click.option('--node', '-n', help='Monitor recovery on specific node only')
@click.option('--watch', '-w', is_flag=True, help='Continuously monitor (refresh every 10s)')
@click.option('--refresh-interval', default=10, help='Refresh interval for watch mode (seconds)')
@click.option('--recovery-type', type=click.Choice(['PEER', 'DISK', 'all']), default='all', help='Filter by recovery type')
@click.option('--include-transitioning', is_flag=True, help='Include completed recoveries still in transitioning state')
@click.pass_context
def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: int, recovery_type: str, include_transitioning: bool):
    """Monitor active shard recovery operations on the cluster

    This command monitors ongoing shard recoveries by querying sys.allocations
    and sys.shards tables. It shows recovery progress, type (PEER/DISK), and timing.

    By default, only shows actively progressing recoveries. Use --include-transitioning
    to also see completed recoveries that haven't fully transitioned to STARTED state.

    Examples:
        xmover monitor-recovery                          # Show active recoveries only
        xmover monitor-recovery --include-transitioning  # Show active + transitioning
        xmover monitor-recovery --table myTable          # Monitor specific table
        xmover monitor-recovery --watch                  # Continuous monitoring
        xmover monitor-recovery --recovery-type PEER     # Only PEER recoveries
    """

    def _node_route(recovery) -> str:
        # Human-readable "source β†’ destination" suffix for a recovery line.
        if recovery.recovery_type == "PEER" and recovery.source_node_name:
            return f" {recovery.source_node_name} β†’ {recovery.node_name}"
        if recovery.recovery_type == "DISK":
            return f" disk β†’ {recovery.node_name}"
        return ""

    def _table_display(recovery) -> str:
        # Hide the default "doc" schema for brevity.
        if recovery.schema_name == "doc":
            return recovery.table_name
        return f"{recovery.schema_name}.{recovery.table_name}"

    try:
        client = ctx.obj['client']
        recovery_monitor = RecoveryMonitor(client)

        if watch:
            console.print(f"πŸ”„ Monitoring shard recoveries (refreshing every {refresh_interval}s)")
            console.print("Press Ctrl+C to stop")
            console.print()

            try:
                # Show header once
                console.print("πŸ“Š Recovery Progress Monitor")
                console.print("=" * 80)

                # Track previous state for change detection
                previous_recoveries = {}
                first_run = True

                while True:
                    recoveries = recovery_monitor.get_cluster_recovery_status(
                        table_name=table,
                        node_name=node,
                        recovery_type_filter=recovery_type,
                        include_transitioning=include_transitioning
                    )

                    current_time = datetime.now().strftime("%H:%M:%S")

                    changes = []
                    active_count = 0
                    completed_count = 0

                    for recovery in recoveries:
                        recovery_key = f"{recovery.schema_name}.{recovery.table_name}.{recovery.shard_id}.{recovery.node_name}"
                        table_display = _table_display(recovery)

                        # Count active vs completed
                        if recovery.stage == "DONE" and recovery.overall_progress >= 100.0:
                            completed_count += 1
                        else:
                            active_count += 1

                        # Check for changes since last update
                        if recovery_key in previous_recoveries:
                            prev = previous_recoveries[recovery_key]
                            if prev['progress'] != recovery.overall_progress:
                                diff = recovery.overall_progress - prev['progress']
                                node_route = _node_route(recovery)
                                translog_info = format_translog_info(recovery)
                                if diff > 0:
                                    changes.append(f"[green]πŸ“ˆ[/green] {table_display} S{recovery.shard_id} {recovery.overall_progress:.1f}% (+{diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}")
                                else:
                                    changes.append(f"[yellow]πŸ“‰[/yellow] {table_display} S{recovery.shard_id} {recovery.overall_progress:.1f}% ({diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}")
                            elif prev['stage'] != recovery.stage:
                                node_route = _node_route(recovery)
                                translog_info = format_translog_info(recovery)
                                changes.append(f"[blue]πŸ”„[/blue] {table_display} S{recovery.shard_id} {prev['stage']}β†’{recovery.stage} {recovery.size_gb:.1f}GB{translog_info}{node_route}")
                        else:
                            # New recovery - show based on include_transitioning flag or first run
                            if first_run or include_transitioning or (recovery.overall_progress < 100.0 or recovery.stage != "DONE"):
                                node_route = _node_route(recovery)
                                status_icon = "[cyan]πŸ†•[/cyan]" if not first_run else "[blue]πŸ“‹[/blue]"
                                translog_info = format_translog_info(recovery)
                                changes.append(f"{status_icon} {table_display} S{recovery.shard_id} {recovery.stage} {recovery.overall_progress:.1f}% {recovery.size_gb:.1f}GB{translog_info}{node_route}")

                        # Store current state for next comparison
                        previous_recoveries[recovery_key] = {
                            'progress': recovery.overall_progress,
                            'stage': recovery.stage
                        }

                    # Always show a status line
                    if not recoveries:
                        console.print(f"{current_time} | [green]No recoveries - cluster stable[/green]")
                        previous_recoveries.clear()
                    else:
                        status = ""
                        if active_count > 0:
                            status = f"{active_count} active"
                        if completed_count > 0:
                            status += f", {completed_count} done" if status else f"{completed_count} done"

                        if changes:
                            console.print(f"{current_time} | {status}")
                            for change in changes:
                                console.print(f" | {change}")
                        else:
                            # Show periodic status even without changes
                            if include_transitioning and completed_count > 0:
                                console.print(f"{current_time} | {status} (transitioning)")
                            elif active_count > 0:
                                console.print(f"{current_time} | {status} (no changes)")

                    first_run = False
                    time.sleep(refresh_interval)

            except KeyboardInterrupt:
                console.print("\n\n[yellow]⏹ Monitoring stopped by user[/yellow]")

                # Show final summary
                final_recoveries = recovery_monitor.get_cluster_recovery_status(
                    table_name=table,
                    node_name=node,
                    recovery_type_filter=recovery_type,
                    include_transitioning=include_transitioning
                )

                if final_recoveries:
                    console.print("\nπŸ“Š [bold]Final Recovery Summary:[/bold]")
                    summary = recovery_monitor.get_recovery_summary(final_recoveries)

                    active_count = len([r for r in final_recoveries if r.overall_progress < 100.0 or r.stage != "DONE"])
                    completed_count = len(final_recoveries) - active_count

                    console.print(f" Total recoveries: {summary['total_recoveries']}")
                    console.print(f" Active: {active_count}, Completed: {completed_count}")
                    console.print(f" Total size: {summary['total_size_gb']:.1f} GB")
                    console.print(f" Average progress: {summary['avg_progress']:.1f}%")

                    if summary['by_type']:
                        console.print(" By recovery type:")
                        for rec_type, stats in summary['by_type'].items():
                            console.print(f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress")
                else:
                    console.print("\n[green]βœ… No active recoveries at exit[/green]")

            return

        else:
            # Single status check
            recoveries = recovery_monitor.get_cluster_recovery_status(
                table_name=table,
                node_name=node,
                recovery_type_filter=recovery_type,
                include_transitioning=include_transitioning
            )

            display_output = recovery_monitor.format_recovery_display(recoveries)
            console.print(display_output)

            if not recoveries:
                if include_transitioning:
                    console.print("\n[green]βœ… No recoveries found (active or transitioning)[/green]")
                else:
                    console.print("\n[green]βœ… No active recoveries found[/green]")
                    console.print("[dim]πŸ’‘ Use --include-transitioning to see completed recoveries still transitioning[/dim]")
            else:
                summary = recovery_monitor.get_recovery_summary(recoveries)
                console.print("\nπŸ“Š [bold]Recovery Summary:[/bold]")
                console.print(f" Total recoveries: {summary['total_recoveries']}")
                console.print(f" Total size: {summary['total_size_gb']:.1f} GB")
                console.print(f" Average progress: {summary['avg_progress']:.1f}%")

                if summary['by_type']:
                    console.print("\n By recovery type:")
                    for rec_type, stats in summary['by_type'].items():
                        console.print(f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress")

                console.print("\n[dim]πŸ’‘ Use --watch flag for continuous monitoring[/dim]")

    except Exception as e:
        console.print(f"[red]❌ Error monitoring recoveries: {e}[/red]")
        if ctx.obj.get('debug'):
            raise
def _execute_recommendations_safely(client, recommendations, validate: bool):
    """Execute move recommendations with extensive safety measures.

    Optionally re-validates each recommendation, requires a double interactive
    confirmation, executes the generated ALTER TABLE ... REROUTE statements one
    by one, throttles on recovery capacity between moves, and prints a summary.
    """
    # Relative import: this module lives inside the cratedb_toolkit.admin.xmover
    # package, so the old standalone `xmover` package name no longer resolves.
    from .analyzer import ShardAnalyzer

    # Filter to only safe recommendations
    safe_recommendations = []
    if validate:
        analyzer = ShardAnalyzer(client)
        for rec in recommendations:
            is_safe, safety_msg = analyzer.validate_move_safety(rec, max_disk_usage_percent=95.0)
            if is_safe:
                safe_recommendations.append(rec)
    else:
        safe_recommendations = recommendations

    if not safe_recommendations:
        console.print("[yellow]⚠ No safe recommendations to execute[/yellow]")
        return

    console.print("\n[bold red]🚨 AUTO-EXECUTION MODE 🚨[/bold red]")
    console.print(f"About to execute {len(safe_recommendations)} shard moves automatically:")
    console.print()

    # Show what will be executed
    for i, rec in enumerate(safe_recommendations, 1):
        table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name
        console.print(f" {i}. {table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB) {rec.from_node} β†’ {rec.to_node}")

    console.print()
    console.print("[bold yellow]⚠ SAFETY WARNINGS:[/bold yellow]")
    console.print(" β€’ These commands will immediately start shard movements")
    console.print(" β€’ Each move will temporarily impact cluster performance")
    console.print(" β€’ Recovery time depends on shard size and network speed")
    console.print(" β€’ You should monitor progress with: xmover monitor-recovery --watch")
    console.print()

    # Double confirmation before touching the cluster.
    try:
        response1 = input("Type 'EXECUTE' to proceed with automatic execution: ").strip()
        if response1 != "EXECUTE":
            console.print("[yellow]❌ Execution cancelled[/yellow]")
            return

        response2 = input(f"Confirm: Execute {len(safe_recommendations)} shard moves? (yes/no): ").strip().lower()
        if response2 not in ['yes', 'y']:
            console.print("[yellow]❌ Execution cancelled[/yellow]")
            return

    except KeyboardInterrupt:
        console.print("\n[yellow]❌ Execution cancelled by user[/yellow]")
        return

    console.print(f"\nπŸš€ [bold green]Executing {len(safe_recommendations)} shard moves...[/bold green]")
    console.print()

    successful_moves = 0
    failed_moves = 0

    for i, rec in enumerate(safe_recommendations, 1):
        table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name
        sql_command = rec.to_sql()

        console.print(f"[{i}/{len(safe_recommendations)}] Executing: {table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB)")
        console.print(f" {rec.from_node} β†’ {rec.to_node}")

        try:
            # Execute the SQL command
            result = client.execute_query(sql_command)

            if result.get('rowcount', 0) >= 0:  # Success indicator for ALTER statements
                console.print(" [green]βœ… SUCCESS[/green] - Move initiated")
                successful_moves += 1

                # Smart delay: check active recoveries before next move
                if i < len(safe_recommendations):
                    _wait_for_recovery_capacity(client, max_concurrent_recoveries=5)
            else:
                console.print(f" [red]❌ FAILED[/red] - Unexpected result: {result}")
                failed_moves += 1

        except Exception as e:
            console.print(f" [red]❌ FAILED[/red] - Error: {e}")
            failed_moves += 1

            # Ask whether to continue after a failure
            if i < len(safe_recommendations):
                try:
                    continue_response = input(f" Continue with remaining {len(safe_recommendations) - i} moves? (yes/no): ").strip().lower()
                    if continue_response not in ['yes', 'y']:
                        console.print("[yellow]⏹ Execution stopped by user[/yellow]")
                        break
                except KeyboardInterrupt:
                    console.print("\n[yellow]⏹ Execution stopped by user[/yellow]")
                    break

        console.print()

    # Final summary
    console.print("πŸ“Š [bold]Execution Summary:[/bold]")
    console.print(f" Successful moves: [green]{successful_moves}[/green]")
    console.print(f" Failed moves: [red]{failed_moves}[/red]")
    console.print(f" Total attempted: {successful_moves + failed_moves}")

    if successful_moves > 0:
        console.print()
        console.print("[green]βœ… Shard moves initiated successfully![/green]")
        console.print("[dim]πŸ’‘ Monitor progress with:[/dim]")
        console.print("[dim] xmover monitor-recovery --watch[/dim]")
        console.print("[dim]πŸ’‘ Check cluster status with:[/dim]")
        console.print("[dim] xmover analyze[/dim]")

    if failed_moves > 0:
        console.print()
        console.print(f"[yellow]⚠ {failed_moves} moves failed - check cluster status and retry if needed[/yellow]")


if __name__ == '__main__':
    main()
about a CrateDB node""" + id: str + name: str + zone: str + heap_used: int + heap_max: int + fs_total: int + fs_used: int + fs_available: int + + @property + def heap_usage_percent(self) -> float: + return (self.heap_used / self.heap_max) * 100 if self.heap_max > 0 else 0 + + @property + def disk_usage_percent(self) -> float: + return (self.fs_used / self.fs_total) * 100 if self.fs_total > 0 else 0 + + @property + def available_space_gb(self) -> float: + return self.fs_available / (1024**3) + + +@dataclass +class ShardInfo: + """Information about a shard""" + table_name: str + schema_name: str + shard_id: int + node_id: str + node_name: str + zone: str + is_primary: bool + size_bytes: int + size_gb: float + num_docs: int + state: str + routing_state: str + + @property + def shard_type(self) -> str: + return "PRIMARY" if self.is_primary else "REPLICA" + + +@dataclass +class RecoveryInfo: + """Information about an active shard recovery""" + schema_name: str + table_name: str + shard_id: int + node_name: str + node_id: str + recovery_type: str # PEER, DISK, etc. + stage: str # INIT, INDEX, VERIFY_INDEX, TRANSLOG, FINALIZE, DONE + files_percent: float + bytes_percent: float + total_time_ms: int + routing_state: str # INITIALIZING, RELOCATING, etc. 
+ current_state: str # from allocations + is_primary: bool + size_bytes: int + source_node_name: Optional[str] = None # Source node for PEER recoveries + translog_size_bytes: int = 0 # Translog size in bytes + + @property + def overall_progress(self) -> float: + """Calculate overall progress percentage""" + return max(self.files_percent, self.bytes_percent) + + @property + def size_gb(self) -> float: + """Size in GB""" + return self.size_bytes / (1024**3) + + @property + def shard_type(self) -> str: + return "PRIMARY" if self.is_primary else "REPLICA" + + @property + def total_time_seconds(self) -> float: + """Total time in seconds""" + return self.total_time_ms / 1000.0 + + @property + def translog_size_gb(self) -> float: + """Translog size in GB""" + return self.translog_size_bytes / (1024**3) + + @property + def translog_percentage(self) -> float: + """Translog size as percentage of shard size""" + return (self.translog_size_bytes / self.size_bytes * 100) if self.size_bytes > 0 else 0 + + +class CrateDBClient: + """Client for connecting to CrateDB and executing queries""" + + def __init__(self, connection_string: Optional[str] = None): + load_dotenv() + + self.connection_string = connection_string or os.getenv('CRATE_CONNECTION_STRING') + if not self.connection_string: + raise ValueError("CRATE_CONNECTION_STRING not found in environment or provided") + + self.username = os.getenv('CRATE_USERNAME') + self.password = os.getenv('CRATE_PASSWORD') + self.ssl_verify = os.getenv('CRATE_SSL_VERIFY', 'true').lower() == 'true' + + # Ensure connection string ends with _sql endpoint + if not self.connection_string.endswith('/_sql'): + self.connection_string = self.connection_string.rstrip('/') + '/_sql' + + def execute_query(self, query: str, parameters: Optional[List] = None) -> Dict[str, Any]: + """Execute a SQL query against CrateDB""" + payload = { + 'stmt': query + } + + if parameters: + payload['args'] = parameters + + auth = None + if self.username and self.password: 
+ auth = (self.username, self.password) + + try: + response = requests.post( + self.connection_string, + json=payload, + auth=auth, + verify=self.ssl_verify, + timeout=30 + ) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + raise Exception(f"Failed to execute query: {e}") + + def get_nodes_info(self) -> List[NodeInfo]: + """Get information about all nodes in the cluster""" + query = """ + SELECT + id, + name, + attributes['zone'] as zone, + heap['used'] as heap_used, + heap['max'] as heap_max, + fs['total']['size'] as fs_total, + fs['total']['used'] as fs_used, + fs['total']['available'] as fs_available + FROM sys.nodes + WHERE name IS NOT NULL + ORDER BY name + """ + + result = self.execute_query(query) + nodes = [] + + for row in result.get('rows', []): + nodes.append(NodeInfo( + id=row[0], + name=row[1], + zone=row[2] or 'unknown', + heap_used=row[3] or 0, + heap_max=row[4] or 0, + fs_total=row[5] or 0, + fs_used=row[6] or 0, + fs_available=row[7] or 0 + )) + + return nodes + + def get_shards_info(self, table_name: Optional[str] = None, + min_size_gb: Optional[float] = None, + max_size_gb: Optional[float] = None, + for_analysis: bool = False) -> List[ShardInfo]: + """Get information about shards, optionally filtered by table and size + + Args: + table_name: Filter by specific table + min_size_gb: Minimum shard size in GB + max_size_gb: Maximum shard size in GB + for_analysis: If True, includes all shards regardless of state (for cluster analysis) + If False, only includes healthy shards suitable for operations + """ + + where_conditions = [] + if not for_analysis: + # For operations, only include healthy shards + where_conditions.extend([ + "s.routing_state = 'STARTED'", + "s.recovery['files']['percent'] = 100.0" + ]) + parameters = [] + + if table_name: + where_conditions.append("s.table_name = ?") + parameters.append(table_name) + + if min_size_gb is not None: + where_conditions.append("s.size >= ?") 
+ parameters.append(int(min_size_gb * 1024**3)) # Convert GB to bytes + + if max_size_gb is not None: + where_conditions.append("s.size <= ?") + parameters.append(int(max_size_gb * 1024**3)) # Convert GB to bytes + + where_clause = "" + if where_conditions: + where_clause = f"WHERE {' AND '.join(where_conditions)}" + + query = f""" + SELECT + s.table_name, + s.schema_name, + s.id as shard_id, + s.node['id'] as node_id, + s.node['name'] as node_name, + n.attributes['zone'] as zone, + s."primary" as is_primary, + s.size as size_bytes, + s.size / 1024.0^3 as size_gb, + s.num_docs, + s.state, + s.routing_state + FROM sys.shards s + JOIN sys.nodes n ON s.node['id'] = n.id + {where_clause} + ORDER BY s.table_name, s.schema_name, s.id, s."primary" DESC + """ + + result = self.execute_query(query, parameters) + shards = [] + + for row in result.get('rows', []): + shards.append(ShardInfo( + table_name=row[0], + schema_name=row[1], + shard_id=row[2], + node_id=row[3], + node_name=row[4], + zone=row[5] or 'unknown', + is_primary=row[6], + size_bytes=row[7] or 0, + size_gb=float(row[8] or 0), + num_docs=row[9] or 0, + state=row[10], + routing_state=row[11] + )) + + return shards + + def get_shard_distribution_summary(self, for_analysis: bool = True) -> Dict[str, Any]: + """Get a summary of shard distribution across nodes and zones + + Args: + for_analysis: If True, includes all shards for complete cluster analysis + If False, only includes operational shards + """ + where_clause = "" + if not for_analysis: + where_clause = """ + WHERE s.routing_state = 'STARTED' + AND s.recovery['files']['percent'] = 100.0""" + + query = f""" + SELECT + n.attributes['zone'] as zone, + s.node['name'] as node_name, + CASE WHEN s."primary" = true THEN 'PRIMARY' ELSE 'REPLICA' END as shard_type, + COUNT(*) as shard_count, + SUM(s.size) / 1024.0^3 as total_size_gb, + AVG(s.size) / 1024.0^3 as avg_size_gb + FROM sys.shards s + JOIN sys.nodes n ON s.node['id'] = n.id{where_clause} + GROUP BY 
n.attributes['zone'], s.node['name'], s."primary" + ORDER BY zone, node_name, shard_type DESC + """ + + result = self.execute_query(query) + + summary = { + 'by_zone': {}, + 'by_node': {}, + 'totals': {'primary': 0, 'replica': 0, 'total_size_gb': 0} + } + + for row in result.get('rows', []): + zone = row[0] or 'unknown' + node_name = row[1] + shard_type = row[2] + shard_count = row[3] + total_size_gb = float(row[4] or 0) + avg_size_gb = float(row[5] or 0) + + # By zone summary + if zone not in summary['by_zone']: + summary['by_zone'][zone] = {'PRIMARY': 0, 'REPLICA': 0, 'total_size_gb': 0} + summary['by_zone'][zone][shard_type] += shard_count + summary['by_zone'][zone]['total_size_gb'] += total_size_gb + + # By node summary + if node_name not in summary['by_node']: + summary['by_node'][node_name] = { + 'zone': zone, + 'PRIMARY': 0, + 'REPLICA': 0, + 'total_size_gb': 0 + } + summary['by_node'][node_name][shard_type] += shard_count + summary['by_node'][node_name]['total_size_gb'] += total_size_gb + + # Overall totals + if shard_type == 'PRIMARY': + summary['totals']['primary'] += shard_count + else: + summary['totals']['replica'] += shard_count + summary['totals']['total_size_gb'] += total_size_gb + + return summary + + def test_connection(self) -> bool: + """Test the connection to CrateDB""" + try: + result = self.execute_query("SELECT 1") + return result.get('rowcount', 0) >= 0 + except Exception: + return False + + def get_cluster_watermarks(self) -> Dict[str, Any]: + """Get cluster disk watermark settings""" + query = """ + SELECT settings['cluster']['routing']['allocation']['disk']['watermark'] + FROM sys.cluster + """ + + try: + result = self.execute_query(query) + if result.get('rows'): + watermarks = result['rows'][0][0] or {} + return { + 'low': watermarks.get('low', 'Not set'), + 'high': watermarks.get('high', 'Not set'), + 'flood_stage': watermarks.get('flood_stage', 'Not set'), + 'enable_for_single_data_node': watermarks.get('enable_for_single_data_node', 
'Not set') + } + return {} + except Exception: + return {} + + def get_active_recoveries(self, table_name: Optional[str] = None, + node_name: Optional[str] = None) -> List[Dict[str, Any]]: + """Get shards that are currently in recovery states from sys.allocations""" + + where_conditions = ["current_state != 'STARTED'"] + parameters = [] + + if table_name: + where_conditions.append("table_name = ?") + parameters.append(table_name) + + if node_name: + where_conditions.append("node_id = (SELECT id FROM sys.nodes WHERE name = ?)") + parameters.append(node_name) + + where_clause = f"WHERE {' AND '.join(where_conditions)}" + + query = f""" + SELECT + table_name, + shard_id, + current_state, + explanation, + node_id + FROM sys.allocations + {where_clause} + ORDER BY current_state, table_name, shard_id + """ + + result = self.execute_query(query, parameters) + + allocations = [] + for row in result.get('rows', []): + allocations.append({ + 'schema_name': 'doc', # Default schema since not available in sys.allocations + 'table_name': row[0], + 'shard_id': row[1], + 'current_state': row[2], + 'explanation': row[3], + 'node_id': row[4] + }) + + return allocations + + def get_recovery_details(self, schema_name: str, table_name: str, shard_id: int) -> Optional[Dict[str, Any]]: + """Get detailed recovery information for a specific shard from sys.shards""" + + # Query for shards that are actively recovering (not completed) + query = """ + SELECT + s.table_name, + s.schema_name, + s.id as shard_id, + s.node['name'] as node_name, + s.node['id'] as node_id, + s.routing_state, + s.state, + s.recovery, + s.size, + s."primary", + s.translog_stats['size'] as translog_size + FROM sys.shards s + WHERE s.table_name = ? AND s.id = ? 
+ AND (s.state = 'RECOVERING' OR s.routing_state IN ('INITIALIZING', 'RELOCATING')) + ORDER BY s.schema_name + LIMIT 1 + """ + + result = self.execute_query(query, [table_name, shard_id]) + + if not result.get('rows'): + return None + + row = result['rows'][0] + return { + 'table_name': row[0], + 'schema_name': row[1], + 'shard_id': row[2], + 'node_name': row[3], + 'node_id': row[4], + 'routing_state': row[5], + 'state': row[6], + 'recovery': row[7], + 'size': row[8], + 'primary': row[9], + 'translog_size': row[10] or 0 + } + + def get_all_recovering_shards(self, table_name: Optional[str] = None, + node_name: Optional[str] = None, + include_transitioning: bool = False) -> List[RecoveryInfo]: + """Get comprehensive recovery information by combining sys.allocations and sys.shards data""" + + # Step 1: Get active recoveries from allocations (efficient) + active_allocations = self.get_active_recoveries(table_name, node_name) + + if not active_allocations: + return [] + + recoveries = [] + + # Step 2: Get detailed recovery info for each active recovery + for allocation in active_allocations: + recovery_detail = self.get_recovery_details( + allocation['schema_name'], # This will be 'doc' default + allocation['table_name'], + allocation['shard_id'] + ) + + if recovery_detail and recovery_detail.get('recovery'): + # Update allocation with actual schema from sys.shards + allocation['schema_name'] = recovery_detail['schema_name'] + recovery_info = self._parse_recovery_info(allocation, recovery_detail) + + # Filter out completed recoveries unless include_transitioning is True + if include_transitioning or not self._is_recovery_completed(recovery_info): + recoveries.append(recovery_info) + + # Sort by recovery type, then by progress + return sorted(recoveries, key=lambda r: (r.recovery_type, -r.overall_progress)) + + def _parse_recovery_info(self, allocation: Dict[str, Any], + shard_detail: Dict[str, Any]) -> RecoveryInfo: + """Parse recovery information from allocation and 
shard data""" + + recovery = shard_detail.get('recovery', {}) + + # Extract recovery progress information + files_info = recovery.get('files', {}) + size_info = recovery.get('size', {}) + + files_percent = float(files_info.get('percent', 0.0)) + bytes_percent = float(size_info.get('percent', 0.0)) + + # Calculate actual progress based on recovered vs used + files_recovered = files_info.get('recovered', 0) + files_used = files_info.get('used', 1) # Avoid division by zero + size_recovered = size_info.get('recovered', 0) + size_used = size_info.get('used', 1) # Avoid division by zero + + # Use actual progress if different from reported percent + actual_files_percent = (files_recovered / files_used * 100.0) if files_used > 0 else files_percent + actual_size_percent = (size_recovered / size_used * 100.0) if size_used > 0 else bytes_percent + + # Use the more conservative (lower) progress value + final_files_percent = min(files_percent, actual_files_percent) + final_bytes_percent = min(bytes_percent, actual_size_percent) + + # Get source node for PEER recoveries + source_node = None + if recovery.get('type') == 'PEER': + source_node = self._find_source_node_for_recovery( + shard_detail['schema_name'], + shard_detail['table_name'], + shard_detail['shard_id'], + shard_detail['node_id'] + ) + + return RecoveryInfo( + schema_name=shard_detail['schema_name'], + table_name=shard_detail['table_name'], + shard_id=shard_detail['shard_id'], + node_name=shard_detail['node_name'], + node_id=shard_detail['node_id'], + recovery_type=recovery.get('type', 'UNKNOWN'), + stage=recovery.get('stage', 'UNKNOWN'), + files_percent=final_files_percent, + bytes_percent=final_bytes_percent, + total_time_ms=recovery.get('total_time', 0), + routing_state=shard_detail['routing_state'], + current_state=allocation['current_state'], + is_primary=shard_detail['primary'], + size_bytes=shard_detail.get('size', 0), + source_node_name=source_node, + translog_size_bytes=shard_detail.get('translog_size', 0) + 
) + + def _find_source_node_for_recovery(self, schema_name: str, table_name: str, shard_id: int, target_node_id: str) -> Optional[str]: + """Find source node for PEER recovery by looking for primary or other replicas""" + try: + # First try to find the primary shard of the same table/shard + query = """ + SELECT node['name'] as node_name + FROM sys.shards + WHERE schema_name = ? AND table_name = ? AND id = ? + AND state = 'STARTED' AND node['id'] != ? + AND "primary" = true + LIMIT 1 + """ + + result = self.execute_query(query, [schema_name, table_name, shard_id, target_node_id]) + + if result.get('rows'): + return result['rows'][0][0] + + # If no primary found, look for any started replica + query_replica = """ + SELECT node['name'] as node_name + FROM sys.shards + WHERE schema_name = ? AND table_name = ? AND id = ? + AND state = 'STARTED' AND node['id'] != ? + LIMIT 1 + """ + + result = self.execute_query(query_replica, [schema_name, table_name, shard_id, target_node_id]) + + if result.get('rows'): + return result['rows'][0][0] + + except Exception: + # If query fails, just return None + pass + + return None + + def _is_recovery_completed(self, recovery_info: RecoveryInfo) -> bool: + """Check if a recovery is completed but still transitioning""" + return (recovery_info.stage == 'DONE' and + recovery_info.files_percent >= 100.0 and + recovery_info.bytes_percent >= 100.0) \ No newline at end of file diff --git a/cratedb_toolkit/cli.py b/cratedb_toolkit/cli.py index 4e8e17c2..2410d5ec 100644 --- a/cratedb_toolkit/cli.py +++ b/cratedb_toolkit/cli.py @@ -3,6 +3,7 @@ from cratedb_toolkit.util.cli import boot_click +from .admin.xmover.cli import main as admin_xmover_cli from .adapter.rockset.cli import cli as rockset_cli from .cfr.cli import cli as cfr_cli from .cluster.cli import cli as cloud_cli @@ -27,6 +28,7 @@ def cli(ctx: click.Context, verbose: bool, debug: bool): return boot_click(ctx, verbose, debug) +cli.add_command(admin_xmover_cli, name="xmover") 
cli.add_command(info_cli, name="info") cli.add_command(cfr_cli, name="cfr") cli.add_command(cloud_cli, name="cluster") diff --git a/doc/admin/index.md b/doc/admin/index.md new file mode 100644 index 00000000..d36c00e1 --- /dev/null +++ b/doc/admin/index.md @@ -0,0 +1,7 @@ +# Administrative Utilities + +```{toctree} +:maxdepth: 1 + +xmover/index +``` diff --git a/doc/admin/xmover/handbook.md b/doc/admin/xmover/handbook.md new file mode 100644 index 00000000..c103c6f2 --- /dev/null +++ b/doc/admin/xmover/handbook.md @@ -0,0 +1,487 @@ +(xmover-handbook)= +# XMover Handbook + +## Installation + +Install using uv (recommended) or pip: +```bash +uv tool install cratedb-toolkit + +# Alternatively use `pip`. +# pip install --user cratedb-toolkit +``` + +Create an `.env` file with your CrateDB connection details: +```bash +CRATE_CONNECTION_STRING=https://your-cluster.cratedb.net:4200 +CRATE_USERNAME=your-username +CRATE_PASSWORD=your-password +CRATE_SSL_VERIFY=true +``` + +## Quick Start + +### Test Connection +```bash +xmover test-connection +``` + +### Analyze Cluster +```bash +# Complete cluster analysis +xmover analyze + +# Analyze specific table +xmover analyze --table my_table +``` + +### Find Movement Candidates +```bash +# Find shards that can be moved (40-60GB by default) +xmover find-candidates + +# Custom size range +xmover find-candidates --min-size 20 --max-size 100 +``` + +### Generate Recommendations +```bash +# Dry run (default) - shows what would be recommended +xmover recommend + +# Generate actual SQL commands +xmover recommend --execute + +# Prioritize space over zone balancing +xmover recommend --prioritize-space +``` + +### Zone Analysis +```bash +# Check zone balance +xmover check-balance + +# Detailed zone analysis with shard-level details +xmover zone-analysis --show-shards +``` + +### Advanced Troubleshooting +```bash +# Validate specific moves before execution +xmover validate-move SCHEMA.TABLE SHARD_ID FROM_NODE TO_NODE + +# Explain CrateDB error 
messages +xmover explain-error "your error message here" +``` + +## Commands Reference + +### `analyze` +Analyzes current shard distribution across nodes and zones. + +**Options:** +- `--table, -t`: Analyze specific table only + +**Example:** +```bash +xmover analyze --table events +``` + +### `find-candidates` +Finds shards suitable for movement based on size and health criteria. + +**Options:** +- `--table, -t`: Find candidates in specific table only +- `--min-size`: Minimum shard size in GB (default: 40) +- `--max-size`: Maximum shard size in GB (default: 60) +- `--node`: Only show candidates from this specific source node (e.g., data-hot-4) + +**Examples:** +```bash +# Find candidates in size range for specific table +xmover find-candidates --min-size 20 --max-size 50 --table logs + +# Find candidates on a specific node +xmover find-candidates --min-size 30 --max-size 60 --node data-hot-4 +``` + +### `recommend` +Generates intelligent shard movement recommendations for cluster rebalancing. 
+ +**Options:** +- `--table, -t`: Generate recommendations for specific table only +- `--min-size`: Minimum shard size in GB (default: 40) +- `--max-size`: Maximum shard size in GB (default: 60) +- `--zone-tolerance`: Zone balance tolerance percentage (default: 10) +- `--min-free-space`: Minimum free space required on target nodes in GB (default: 100) +- `--max-moves`: Maximum number of move recommendations (default: 10) +- `--max-disk-usage`: Maximum disk usage percentage for target nodes (default: 85) +- `--validate/--no-validate`: Validate move safety (default: True) +- `--prioritize-space/--prioritize-zones`: Prioritize available space over zone balancing (default: False) +- `--dry-run/--execute`: Show what would be done without generating SQL commands (default: True) +- `--node`: Only recommend moves from this specific source node (e.g., data-hot-4) + +**Examples:** +```bash +# Dry run with zone balancing priority +xmover recommend --prioritize-zones + +# Generate SQL for space optimization +xmover recommend --prioritize-space --execute + +# Focus on specific table with custom parameters +xmover recommend --table events --min-size 10 --max-size 30 --execute + +# Target space relief for a specific node +xmover recommend --prioritize-space --min-size 30 --max-size 60 --node data-hot-4 + +# Allow higher disk usage for urgent moves +xmover recommend --prioritize-space --max-disk-usage 90 +``` + +### `zone-analysis` +Provides detailed analysis of zone distribution and potential conflicts. + +**Options:** +- `--table, -t`: Analyze zones for specific table only +- `--show-shards/--no-show-shards`: Show individual shard details (default: False) + +**Example:** +```bash +xmover zone-analysis --show-shards --table critical_data +``` + +### `check-balance` +Checks zone balance for shards with configurable tolerance. 
+ +**Options:** +- `--table, -t`: Check balance for specific table only +- `--tolerance`: Zone balance tolerance percentage (default: 10) + +**Example:** +```bash +xmover check-balance --tolerance 15 +``` + + + +### `validate-move` +Validates a specific shard move before execution to prevent errors. + +**Arguments:** +- `SCHEMA_TABLE`: Schema and table name (format: schema.table) +- `SHARD_ID`: Shard ID to move +- `FROM_NODE`: Source node name +- `TO_NODE`: Target node name + +**Examples:** +```bash +# Standard validation +xmover validate-move CUROV.maddoxxxS 4 data-hot-1 data-hot-3 + +# Allow higher disk usage for urgent moves +xmover validate-move CUROV.tendedero 4 data-hot-1 data-hot-3 --max-disk-usage 90 +``` + +### `explain-error` +Explains CrateDB allocation error messages and provides troubleshooting guidance. + +**Arguments:** +- `ERROR_MESSAGE`: The CrateDB error message to analyze (optional - can be provided interactively) + +**Examples:** +```bash +# Interactive mode +xmover explain-error + +# Direct analysis +xmover explain-error "NO(a copy of this shard is already allocated to this node)" +``` + +### `monitor-recovery` +Monitors active shard recovery operations on the cluster. 
+ +**Options:** +- `--table, -t`: Monitor recovery for specific table only +- `--node, -n`: Monitor recovery on specific node only +- `--watch, -w`: Continuously monitor (refresh every 10s) +- `--refresh-interval`: Refresh interval for watch mode in seconds (default: 10) +- `--recovery-type`: Filter by recovery type - PEER, DISK, or all (default: all) +- `--include-transitioning`: Include recently completed recoveries (DONE stage) + +**Examples:** +```bash +# Check current recovery status +xmover monitor-recovery + +# Monitor specific table recoveries +xmover monitor-recovery --table PartioffD + +# Continuous monitoring with custom refresh rate +xmover monitor-recovery --watch --refresh-interval 5 + +# Monitor only PEER recoveries on specific node +xmover monitor-recovery --node data-hot-1 --recovery-type PEER + +# Include completed recoveries still transitioning +xmover monitor-recovery --watch --include-transitioning +``` + +**Recovery Types:** +- **PEER**: Copying shard data from another node (replication/relocation) +- **DISK**: Rebuilding shard from local data (after restart/disk issues) + +### `test-connection` +Tests the connection to CrateDB and displays basic cluster information. + +## Operation Modes + +### Analysis vs Operational Views + +XMover provides two distinct views of your cluster: + +1. **Analysis View** (`analyze`, `zone-analysis`): Includes ALL shards regardless of state for complete cluster visibility +2. **Operational View** (`find-candidates`, `recommend`): Only includes healthy shards (STARTED + 100% recovered) for safe operations + +### Prioritization Modes + +When generating recommendations, you can choose between two prioritization strategies: + +1. **Zone Balancing Priority** (default): Focuses on achieving optimal zone distribution first, then considers available space +2. 
**Space Priority**: Prioritizes moving shards to nodes with more available space, regardless of zone balance + +### Safety Features + +- **Zone Conflict Detection**: Prevents moves that would place multiple copies of the same shard in the same zone +- **Capacity Validation**: Ensures target nodes have sufficient free space +- **Health Checks**: Only operates on healthy shards (STARTED routing state + 100% recovery) +- **SQL Quoting**: Properly quotes schema and table names in generated SQL commands + +## Example Workflows + +### Regular Cluster Maintenance + +1. Analyze current state: +```bash +xmover analyze +``` + +2. Check for zone imbalances: +```bash +xmover check-balance +``` + +3. Generate and review recommendations: +```bash +xmover recommend --dry-run +``` + +4. Execute safe moves: +```bash +xmover recommend --execute +``` + +### Targeted Node Relief + +When a specific node is running low on space: + +1. Check which node needs relief: +```bash +xmover analyze +``` + +2. Generate recommendations for that specific node: +```bash +xmover recommend --prioritize-space --node data-hot-4 --dry-run +``` + +3. Execute the moves: +```bash +xmover recommend --prioritize-space --node data-hot-4 --execute +``` + +### Monitoring Shard Recovery Operations + +After executing shard moves, monitor the recovery progress: + +1. Execute moves and monitor recovery: +```bash +# Execute moves +xmover recommend --node data-hot-1 --execute + +# Monitor the resulting recoveries +xmover monitor-recovery --watch +``` + +2. Monitor specific table or node recovery: +```bash +# Monitor specific table +xmover monitor-recovery --table shipmentFormFieldData --watch + +# Monitor specific node +xmover monitor-recovery --node data-hot-4 --watch + +# Monitor including completed recoveries +xmover monitor-recovery --watch --include-transitioning +``` + +3. 
Check recovery after node maintenance: +```bash +# After bringing a node back online +xmover monitor-recovery --node data-hot-3 --recovery-type DISK +``` + +### Manual Shard Movement + +1. Validate the move first: +```bash +xmover validate-move SCHEMA.TABLE SHARD_ID FROM_NODE TO_NODE +``` + +2. Generate safe recommendations: +```bash +xmover recommend --prioritize-space --execute +``` + +3. Monitor shard health after moves + +### Troubleshooting Zone Conflicts + +1. Identify conflicts: +```bash +xmover zone-analysis --show-shards +``` + +2. Generate targeted fixes: +```bash +xmover recommend --prioritize-zones --execute +``` + +## Configuration + +### Environment Variables + +- `CRATE_CONNECTION_STRING`: CrateDB HTTP endpoint (required) +- `CRATE_USERNAME`: Username for authentication (optional) +- `CRATE_PASSWORD`: Password for authentication (optional) +- `CRATE_SSL_VERIFY`: Enable SSL certificate verification (default: true) + +### Connection String Format + +``` +https://hostname:port +``` + +The tool automatically appends `/_sql` to the endpoint. + +## Safety Considerations + +⚠️ **Important Safety Notes:** + +1. **Always test in non-production environments first** +2. **Monitor shard health after each move before proceeding with additional moves** +3. **Ensure adequate cluster capacity before decommissioning nodes** +4. **Verify zone distribution after rebalancing operations** +5. **Keep backups current before performing large-scale moves** + +## Troubleshooting + +XMover provides comprehensive troubleshooting tools to help diagnose and resolve shard movement issues. 
+ +### Quick Diagnosis Commands + +```bash +# Validate a specific move before execution +xmover validate-move SCHEMA.TABLE SHARD_ID FROM_NODE TO_NODE + +# Explain CrateDB error messages +xmover explain-error "your error message here" + +# Check zone distribution for conflicts +xmover zone-analysis --show-shards + +# Verify overall cluster health +xmover analyze +``` + +### Common Issues and Solutions + +1. **Zone Conflicts** + ``` + Error: "NO(a copy of this shard is already allocated to this node)" + ``` + - **Cause**: Target node already has a copy of the shard + - **Solution**: Use `xmover zone-analysis --show-shards` to find alternative targets + - **Prevention**: Always use `xmover validate-move` before executing moves + +2. **Zone Allocation Limits** + ``` + Error: "too many copies of the shard allocated to nodes with attribute [zone]" + ``` + - **Cause**: CrateDB's zone awareness prevents too many copies in same zone + - **Solution**: Move shard to a different availability zone + - **Prevention**: Use `xmover recommend` which respects zone constraints + +3. **Insufficient Space** + ``` + Error: "not enough disk space" + ``` + - **Cause**: Target node lacks sufficient free space + - **Solution**: Choose node with more capacity or free up space + - **Check**: `xmover analyze` to see available space per node + +4. **High Disk Usage Blocking Moves** + ``` + Error: "Target node disk usage too high (85.3%)" + ``` + - **Cause**: Target node exceeds default 85% disk usage threshold + - **Solution**: Use `--max-disk-usage` to allow higher usage for urgent moves + - **Example**: `xmover recommend --max-disk-usage 90 --prioritize-space` + +5. 
**No Recommendations Generated** + - **Cause**: Cluster may already be well balanced + - **Solution**: Adjust size filters or check `xmover check-balance` + - **Try**: `--prioritize-space` mode for capacity-based moves + +### Error Message Decoder + +Use the built-in error decoder for complex CrateDB messages: + +```bash +# Interactive mode - paste your error message +xmover explain-error + +# Direct analysis +xmover explain-error "NO(a copy of this shard is already allocated to this node)" +``` + +### Configurable Safety Thresholds + +XMover uses configurable safety thresholds to prevent risky moves: + +**Disk Usage Threshold (default: 85%)** +```bash +# Allow moves to nodes with higher disk usage +xmover recommend --max-disk-usage 90 --prioritize-space + +# For urgent space relief +xmover validate-move SCHEMA.TABLE SHARD_ID FROM TO --max-disk-usage 95 +``` + +**When to Adjust Thresholds:** +- **Emergency situations**: Increase to 90-95% for critical space relief +- **Conservative operations**: Decrease to 75-80% for safer moves +- **Staging environments**: Can be more aggressive (90%+) +- **Production**: Keep conservative (80-85%) + +### Advanced Troubleshooting + +For detailed troubleshooting procedures, see {ref}`xmover-troubleshooting` which covers: +- Step-by-step diagnostic procedures +- Emergency recovery procedures +- Best practices for safe operations +- Complete error reference guide + +### Debug Information + +All commands provide detailed safety validation messages and explanations for any issues detected. diff --git a/doc/admin/xmover/index.md b/doc/admin/xmover/index.md new file mode 100644 index 00000000..7b522310 --- /dev/null +++ b/doc/admin/xmover/index.md @@ -0,0 +1,29 @@ +# XMover + +:::{div} sd-text-muted +CrateDB Shard Analyzer and Movement Tool. +::: + +A comprehensive looking-glass utility for analyzing CrateDB shard +distribution across nodes and availability zones. 
It generates safe +SQL commands for shard rebalancing and node decommissioning. + +## Features + +- **Cluster Analysis**: Complete overview of shard distribution across nodes and zones +- **Shard Movement Recommendations**: Intelligent suggestions for rebalancing with safety validation +- **Recovery Monitoring**: Track ongoing shard recovery operations with progress details +- **Zone Conflict Detection**: Prevents moves that would violate CrateDB's zone awareness +- **Node Decommissioning**: Plan safe node removal with automated shard relocation +- **Dry Run Mode**: Test recommendations without generating actual SQL commands +- **Safety Validation**: Comprehensive checks to ensure data availability during moves + +## Documentation + +```{toctree} +:maxdepth: 1 + +Handbook +Troubleshooting +Query gallery +``` diff --git a/doc/admin/xmover/queries.md b/doc/admin/xmover/queries.md new file mode 100644 index 00000000..4600038c --- /dev/null +++ b/doc/admin/xmover/queries.md @@ -0,0 +1,212 @@ +(xmover-queries)= +# XMover Query Gallery + +## Shard Distribution over Nodes + +```sql +select node['name'], sum(size) / 1024^3, count(id) from sys.shards group by 1 order by 1 asc; ++--------------+-----------------------------+-----------+ +| node['name'] | (sum(size) / 1.073741824E9) | count(id) | ++--------------+-----------------------------+-----------+ +| data-hot-0 | 1862.5866614403203 | 680 | +| data-hot-1 | 1866.0331328986213 | 684 | +| data-hot-2 | 1856.6581886671484 | 1043 | +| data-hot-3 | 1208.932889252901 | 477 | +| data-hot-4 | 1861.7727940855548 | 674 | +| data-hot-5 | 1863.4315695902333 | 744 | +| data-hot-6 | 1851.3522544233128 | 948 | +| NULL | 0.0 | 35 | ++--------------+-----------------------------+-----------+ +SELECT 8 rows in set (0.061 sec) +``` +## Shard Distribution PRIMARY/REPLICAS over nodes + +```sql + +select node['name'], primary, sum(size) / 1024^3, count(id) from sys.shards group by 1,2 order by 1 asc; 
++--------------+---------+-----------------------------+-----------+
+| node['name'] | primary | (sum(size) / 1.073741824E9) | count(id) |
++--------------+---------+-----------------------------+-----------+
+| data-hot-0 | TRUE | 1459.3267894154415 | 447 |
+| data-hot-0 | FALSE | 403.25987202487886 | 233 |
+| data-hot-1 | TRUE | 1209.6781993638724 | 374 |
+| data-hot-1 | FALSE | 656.3549335347489 | 310 |
+| data-hot-2 | TRUE | 1624.9012612393126 | 995 |
+| data-hot-2 | FALSE | 231.5014410642907 | 48 |
+| data-hot-3 | TRUE | 6.339549297466874 | 58 |
+| data-hot-3 | FALSE | 1202.486775631085 | 419 |
+| data-hot-4 | FALSE | 838.5498185381293 | 225 |
+| data-hot-4 | TRUE | 1023.1511942362413 | 449 |
+| data-hot-5 | FALSE | 1002.365406149067 | 422 |
+| data-hot-5 | TRUE | 860.9174101138487 | 322 |
+| data-hot-6 | FALSE | 1850.3959310995415 | 940 |
+| data-hot-6 | TRUE | 0.9159421799704432 | 8 |
+| NULL | FALSE | 0.0 | 35 |
++--------------+---------+-----------------------------+-----------+
+
+```
+
+## Available Disk Space per Node
+
+```sql
++------------+--------------------+-----------------------------------------------+
+| name | attributes['zone'] | (fs[1]['disks']['available'] / 1.073741824E9) |
++------------+--------------------+-----------------------------------------------+
+| data-hot-5 | us-west-2a | 142.3342628479004 |
+| data-hot-0 | us-west-2a | 142.03089141845703 |
+| data-hot-6 | us-west-2b | 159.68728256225586 |
+| data-hot-3 | us-west-2b | 798.8147850036621 |
+| data-hot-2 | us-west-2b | 156.79160690307617 |
+| data-hot-1 | us-west-2c | 145.73613739013672 |
+| data-hot-4 | us-west-2c | 148.39511108398438 |
++------------+--------------------+-----------------------------------------------+
+```
+
+## List the Biggest Shards on a Particular Node
+
+```sql
+select node['name'], table_name, schema_name, id, sum(size) / 1024^3 from sys.shards
+ where node['name'] = 'data-hot-2'
+ AND routing_state = 'STARTED'
+ AND recovery['files']['percent'] = 0
+ group by 
1,2,3,4 order by 5 desc limit 8;
++--------------+-----------------------+-------------+----+-----------------------------+
+| node['name'] | table_name | schema_name | id | (sum(size) / 1.073741824E9) |
++--------------+-----------------------+-------------+----+-----------------------------+
+| data-hot-2 | bottleFieldData | curvo | 5 | 135.568662205711 |
+| data-hot-2 | bottleFieldData | curvo | 8 | 134.813782049343 |
+| data-hot-2 | bottleFieldData | curvo | 3 | 133.43549298401922 |
+| data-hot-2 | bottleFieldData | curvo | 11 | 130.10448653809726 |
+| data-hot-2 | turtleFieldData | curvo | 31 | 54.642812703736126 |
+| data-hot-2 | turtleFieldData | curvo | 29 | 54.06101848650724 |
+| data-hot-2 | turtleFieldData | curvo | 5 | 53.96749582327902 |
+| data-hot-2 | turtleFieldData | curvo | 21 | 53.72262619435787 |
++--------------+-----------------------+-------------+----+-----------------------------+
+SELECT 8 rows in set (0.062 sec)
+```
+
+## Move REROUTE
+```sql
+
+alter table "curvo"."bottleFieldData" reroute move shard 21 from 'data-hot-2' to 'data-hot-3';
+```
+---
+
+```sql
+
+WITH shard_summary AS (
+    SELECT
+        node['name'] AS node_name,
+        table_name,
+        schema_name,
+        CASE
+            WHEN "primary" = true THEN 'PRIMARY'
+            ELSE 'REPLICA'
+        END AS shard_type,
+        COUNT(*) AS shard_count,
+        SUM(size) / 1024^3 AS total_size_gb
+    FROM sys.shards
+    WHERE table_name = 'orderffD'
+      AND routing_state = 'STARTED'
+      AND recovery['files']['percent'] = 0
+    GROUP BY node['name'], table_name, schema_name, "primary"
+)
+SELECT
+    node_name,
+    table_name,
+    schema_name,
+    shard_type,
+    shard_count,
+    ROUND(total_size_gb, 2) AS total_size_gb,
+    ROUND(total_size_gb / shard_count, 2) AS avg_shard_size_gb
+FROM shard_summary
+ORDER BY node_name, shard_type DESC, total_size_gb DESC;
+```
+
+```sql
+-- Comprehensive shard distribution showing both node and zone details
+SELECT
+    n.attributes['zone'] AS zone,
+    s.node['name'] AS node_name,
+    s.table_name,
+    s.schema_name,
+    CASE
+        WHEN 
s."primary" = true THEN 'PRIMARY' + ELSE 'REPLICA' + END AS shard_type, + s.id AS shard_id, + s.size / 1024^3 AS shard_size_gb, + s.num_docs, + s.state +FROM sys.shards s +JOIN sys.nodes n ON s.node['id'] = n.id +WHERE s.table_name = 'your_table_name' -- Replace with your specific table name + AND s.routing_state = 'STARTED' + AND s.recovery['files']['percent'] = 0 +ORDER BY + n.attributes['zone'], + s.node['name'], + s."primary" DESC, -- Primary shards first + s.id; + +-- Summary by zone and shard type +SELECT + n.attributes['zone'] AS zone, + CASE + WHEN s."primary" = true THEN 'PRIMARY' + ELSE 'REPLICA' + END AS shard_type, + COUNT(*) AS shard_count, + COUNT(DISTINCT s.node['name']) AS nodes_with_shards, + ROUND(SUM(s.size) / 1024^3, 2) AS total_size_gb, + ROUND(AVG(s.size) / 1024^3, 3) AS avg_shard_size_gb, + SUM(s.num_docs) AS total_documents +FROM sys.shards s +JOIN sys.nodes n ON s.node['id'] = n.id +WHERE s.table_name = 'orderffD' -- Replace with your specific table name + AND s.routing_state = 'STARTED' + AND s.recovery['files']['percent'] = 0 +GROUP BY n.attributes['zone'], s."primary" +ORDER BY zone, shard_type DESC; + +``` + +## Relocation + +```sql +SELECT + table_name, + shard_id, + current_state, + explanation, + node_id + FROM sys.allocations + WHERE current_state != 'STARTED' and table_name = 'dispatchio' and shard_id = 19 + ORDER BY current_state, table_name, shard_id; + ++-----------------------+----------+---------------+-------------+------------------------+ +| table_name | shard_id | current_state | explanation | node_id | ++-----------------------+----------+---------------+-------------+------------------------+ +| dispatchio | 19 | RELOCATING | NULL | ZH6fBanGSjanGqeSh-sw0A | ++-----------------------+----------+---------------+-------------+------------------------+ +``` + +```sql +SELECT + COUNT(*) as recovering_shards + FROM sys.shards + WHERE state = 'RECOVERING' OR routing_state IN ('INITIALIZING', 'RELOCATING'); + +``` + +```sql 
+SELECT + table_name, + shard_id, + current_state, + explanation, + node_id + FROM sys.allocations + WHERE current_state != 'STARTED' and table_name = 'dispatchio' and shard_id = 19 + ORDER BY current_state, table_name, shard_id; +``` diff --git a/doc/admin/xmover/troubleshooting.md b/doc/admin/xmover/troubleshooting.md new file mode 100644 index 00000000..14567586 --- /dev/null +++ b/doc/admin/xmover/troubleshooting.md @@ -0,0 +1,424 @@ +(xmover-troubleshooting)= +# Troubleshooting CrateDB using XMover + +This guide helps you diagnose and resolve common issues when using XMover for CrateDB shard management. + +## Quick Diagnosis Commands + +Before troubleshooting, run these commands to understand your cluster state: + +```bash +# Check overall cluster health +xmover analyze + +# Check zone distribution for conflicts +xmover zone-analysis --show-shards + +# Validate a specific move before execution +xmover validate-move SCHEMA.TABLE SHARD_ID FROM_NODE TO_NODE + +# Explain CrateDB error messages +xmover explain-error "your error message here" +``` + +## Common Issues and Solutions + +### 1. 
Zone Conflicts + +#### Symptoms +- Error: `NO(a copy of this shard is already allocated to this node)` +- Error: `NO(there are too many copies of the shard allocated to nodes with attribute [zone])` +- Recommendations show zone conflicts in safety validation + +#### Root Causes +- Target node already has a copy of the shard (primary or replica) +- Target zone already has copies, violating CrateDB's zone awareness +- Incorrect understanding of current shard distribution + +#### Solutions + +**Step 1: Analyze Current Distribution** +```bash +# See exactly where shard copies are located +xmover zone-analysis --show-shards --table YOUR_TABLE + +# Check overall zone balance +xmover check-balance +``` + +**Step 2: Find Alternative Targets** +```bash +# Find nodes with available capacity in different zones +xmover analyze + +# Get movement candidates with size filters +xmover find-candidates --min-size 20 --max-size 30 +``` + +**Step 3: Validate Before Moving** +```bash +# Always validate moves before execution +xmover validate-move SCHEMA.TABLE SHARD_ID FROM_NODE TO_NODE +``` + +#### Prevention +- Always use `xmover recommend` instead of manual moves +- Enable dry-run mode by default: `xmover recommend --dry-run` +- Check zone distribution before planning moves + +### 2. 
Insufficient Space Issues + +#### Symptoms +- Error: `not enough disk space` +- Safety validation fails with space warnings +- High disk usage percentages in cluster analysis + +#### Root Causes +- Target node doesn't have enough free space for the shard +- High disk usage on target nodes (>85%) +- Insufficient buffer space for safe operations + +#### Solutions + +**Step 1: Check Available Space** +```bash +# Review node capacity and usage +xmover analyze + +# Look for nodes with more available space +xmover find-candidates --min-size 0 --max-size 100 +``` + +**Step 2: Adjust Parameters** +```bash +# Increase minimum free space requirement +xmover recommend --min-free-space 200 + +# Focus on smaller shards +xmover recommend --max-size 50 +``` + +**Step 3: Free Up Space** +- Delete old snapshots and unused data +- Move other shards away from constrained nodes +- Consider adding nodes to the cluster + +#### Prevention +- Monitor disk usage regularly with `xmover analyze` +- Set conservative `--min-free-space` values (default: 100GB) +- Plan capacity expansion before reaching 80% disk usage + +### 3. 
Node Performance Issues + +#### Symptoms +- Error: `shard recovery limit` +- High heap usage warnings +- Slow shard movement operations + +#### Root Causes +- Too many concurrent shard movements +- High heap usage on target nodes (>80%) +- Resource contention during moves + +#### Solutions + +**Step 1: Check Node Health** +```bash +# Review heap and disk usage +xmover analyze + +# Check for overloaded nodes +xmover check-balance +``` + +**Step 2: Reduce Concurrent Operations** +```bash +# Move fewer shards at once +xmover recommend --max-moves 3 + +# Wait between moves for recovery completion +# Monitor with CrateDB Admin UI +``` + +**Step 3: Target Less Loaded Nodes** +```bash +# Prioritize nodes with better resources +xmover recommend --prioritize-space +``` + +#### Prevention +- Move shards gradually (5-10 at a time) +- Monitor heap usage and wait for recovery completion +- Avoid moves during high-traffic periods + +### 4. Zone Imbalance Issues + +#### Symptoms +- `check-balance` shows zones marked as "Over" or "Under" +- Zone distribution is uneven +- Some zones have significantly more shards + +#### Root Causes +- Historical data distribution patterns +- Node additions/removals without rebalancing +- Tables created with poor initial distribution + +#### Solutions + +**Step 1: Assess Imbalance** +```bash +# Check current zone balance +xmover check-balance --tolerance 15 + +# Get detailed zone analysis +xmover zone-analysis +``` + +**Step 2: Generate Rebalancing Plan** +```bash +# Prioritize zone balancing +xmover recommend --prioritize-zones --dry-run + +# Review recommendations carefully +xmover recommend --prioritize-zones --max-moves 10 +``` + +**Step 3: Execute Gradually** +```bash +# Execute in small batches +xmover recommend --prioritize-zones --max-moves 5 --execute + +# Monitor progress and repeat +``` + +#### Prevention +- Run regular balance checks: `xmover check-balance` +- Use zone-aware table creation with proper shard allocation +- Plan 
rebalancing during maintenance windows + +### 5. Connection and Authentication Issues + +#### Symptoms +- "Connection failed" errors +- Authentication failures +- SSL/TLS errors + +#### Root Causes +- Incorrect connection string in `.env` +- Wrong credentials +- Network connectivity issues +- SSL certificate problems + +#### Solutions + +**Step 1: Verify Connection** +```bash +# Test basic connectivity +xmover test-connection +``` + +**Step 2: Check Configuration** +```bash +# Verify .env file contents +cat .env + +# Example correct format: +CRATE_CONNECTION_STRING=https://cluster.cratedb.net:4200 +CRATE_USERNAME=admin +CRATE_PASSWORD=your-password +CRATE_SSL_VERIFY=true +``` + +**Step 3: Test Network Access** +```bash +# Test HTTP connectivity +curl -u username:password https://your-cluster:4200/_sql -d '{"stmt":"SELECT 1"}' +``` + +#### Prevention +- Use `.env.example` as a template +- Verify credentials with CrateDB admin +- Test connectivity from deployment environment + +## Error Message Decoder + +### CrateDB Allocation Errors + +Use `xmover explain-error` to decode complex CrateDB error messages: + +```bash +# Interactive mode +xmover explain-error + +# Direct analysis +xmover explain-error "your error message here" +``` + +### Common Error Patterns + +| Error Pattern | Meaning | Quick Fix | +|---------------|---------|-----------| +| `copy of this shard is already allocated` | Node already has shard | Choose different target node | +| `too many copies...with attribute [zone]` | Zone limit exceeded | Move to different zone | +| `not enough disk space` | Insufficient space | Free space or choose different node | +| `shard recovery limit` | Too many concurrent moves | Wait and retry with fewer moves | +| `allocation is disabled` | Cluster allocation disabled | Re-enable allocation settings | + +## Best Practices for Safe Operations + +### Pre-Move Checklist + +1. **Analyze cluster state** + ```bash + xmover analyze + ``` + +2. 
**Check zone distribution**
+   ```bash
+   xmover zone-analysis
+   ```
+
+3. **Generate recommendations**
+   ```bash
+   xmover recommend --dry-run
+   ```
+
+4. **Validate specific moves**
+   ```bash
+   xmover validate-move SCHEMA.TABLE SHARD_ID FROM TO
+   ```
+
+5. **Execute gradually**
+   ```bash
+   xmover recommend --max-moves 5 --execute
+   ```
+
+### During Operations
+
+1. **Monitor shard health**
+   - Check CrateDB Admin UI for recovery progress
+   - Watch for failed or stuck shards
+   - Verify routing state changes to STARTED
+
+2. **Track resource usage**
+   - Monitor disk and heap usage on target nodes
+   - Watch for network saturation during moves
+   - Check cluster performance metrics
+
+3. **Maintain documentation**
+   - Record moves performed and reasons
+   - Note any issues encountered
+   - Document lessons learned
+
+### Post-Move Verification
+
+1. **Verify shard health**
+   ```sql
+   SELECT table_name, id, "primary", node['name'], routing_state
+   FROM sys.shards
+   WHERE table_name = 'your_table' AND routing_state != 'STARTED';
+   ```
+
+2. **Check zone balance**
+   ```bash
+   xmover check-balance
+   ```
+
+3. **Monitor cluster performance**
+   - Query response times
+   - Resource utilization
+   - Error rates
+
+## Emergency Procedures
+
+### Stuck Shard Recovery
+
+If a shard gets stuck during movement:
+
+1. **Check shard status**
+   ```sql
+   SELECT * FROM sys.shards WHERE routing_state != 'STARTED';
+   ```
+
+2. **Cancel problematic moves**
+   ```sql
+   ALTER TABLE "schema"."table" REROUTE CANCEL SHARD <shard_id> ON '<node_id>';
+   ```
+
+3. **Retry allocation**
+   ```sql
+   ALTER TABLE "schema"."table" REROUTE RETRY FAILED;
+   ```
+
+### Cluster Health Issues
+
+If moves cause cluster problems:
+
+1. **Disable allocation temporarily**
+   ```text
+   PUT /_cluster/settings
+   {
+     "persistent": {
+       "cluster.routing.allocation.enable": "primaries"
+     }
+   }
+   ```
+
+2. **Wait for stabilization**
+   - Monitor cluster health
+   - Check node resource usage
+   - Verify no failed shards
+
+3. 
**Re-enable allocation** + ```text + PUT /_cluster/settings + { + "persistent": { + "cluster.routing.allocation.enable": "all" + } + } + ``` + +## Getting Help + +### Built-in Help + +```bash +# Command help +xmover --help +xmover COMMAND --help + +# Error explanation +xmover explain-error + +# Move validation +xmover validate-move SCHEMA.TABLE SHARD_ID FROM TO +``` + +### Additional Resources + +- **CrateDB Documentation**: https://crate.io/docs/ +- **Shard Allocation Guide**: https://crate.io/docs/crate/reference/en/latest/admin/system-information.html +- **Cluster Settings**: https://crate.io/docs/crate/reference/en/latest/config/cluster.html + +### Reporting Issues + +When reporting issues, include: + +1. **XMover version and command used** +2. **Complete error message** +3. **Cluster information** (`xmover analyze` output) +4. **Zone analysis** (`xmover zone-analysis` output) +5. **CrateDB version and configuration** + +### Support Checklist + +Before contacting support: + +- [ ] Tried `xmover validate-move` for the specific operation +- [ ] Checked zone distribution with `xmover zone-analysis` +- [ ] Reviewed cluster health with `xmover analyze` +- [ ] Used `xmover explain-error` to decode error messages +- [ ] Verified connection and authentication with `xmover test-connection` +- [ ] Read through this troubleshooting guide +- [ ] Checked CrateDB documentation for allocation settings diff --git a/doc/index.md b/doc/index.md index 56b849ec..17c55514 100644 --- a/doc/index.md +++ b/doc/index.md @@ -30,6 +30,7 @@ changes :caption: Diagnostics :hidden: +admin/index Cluster Information Cluster Flight Recorder (CFR) ``` diff --git a/pyproject.toml b/pyproject.toml index ca6c86cc..c12d4c32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,7 @@ dependencies = [ "python-slugify<9", "pyyaml<7", "requests>=2.28,<3", + "rich<14", "sqlalchemy-cratedb>=0.41.0", "sqlparse<0.6", "tqdm<5", @@ -263,6 +264,7 @@ scripts.cratedb-retention = 
"cratedb_toolkit.retention.cli:cli" scripts.cratedb-toolkit = "cratedb_toolkit.cli:cli" scripts.ctk = "cratedb_toolkit.cli:cli" scripts.migr8 = "cratedb_toolkit.io.mongodb.cli:main" +scripts.xmover = "cratedb_toolkit.admin.xmover.cli:main" entry-points.pytest11.cratedb_service = "cratedb_toolkit.testing.pytest" [tool.setuptools.packages.find] From 70a3bf6f7e776887bed0070c6b19419640caf9b2 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 00:50:43 +0200 Subject: [PATCH 02/18] Admin/XMover: Code formatting and linting --- cratedb_toolkit/admin/xmover/__init__.py | 2 +- cratedb_toolkit/admin/xmover/analyzer.py | 524 +++++++++--------- cratedb_toolkit/admin/xmover/cli.py | 667 +++++++++++------------ cratedb_toolkit/admin/xmover/database.py | 495 ++++++++--------- cratedb_toolkit/cli.py | 2 +- pyproject.toml | 9 +- 6 files changed, 836 insertions(+), 863 deletions(-) diff --git a/cratedb_toolkit/admin/xmover/__init__.py b/cratedb_toolkit/admin/xmover/__init__.py index b941f602..92e9ee84 100644 --- a/cratedb_toolkit/admin/xmover/__init__.py +++ b/cratedb_toolkit/admin/xmover/__init__.py @@ -7,4 +7,4 @@ __version__ = "0.1.0" __author__ = "CrateDB Tools" -__description__ = "CrateDB shard analyzer and movement tool" \ No newline at end of file +__description__ = "CrateDB shard analyzer and movement tool" diff --git a/cratedb_toolkit/admin/xmover/analyzer.py b/cratedb_toolkit/admin/xmover/analyzer.py index 75af9090..36d43618 100644 --- a/cratedb_toolkit/admin/xmover/analyzer.py +++ b/cratedb_toolkit/admin/xmover/analyzer.py @@ -2,17 +2,18 @@ Shard analysis and rebalancing logic for CrateDB """ -from typing import Dict, List, Optional, Set, Any, Tuple -from dataclasses import dataclass -from collections import defaultdict import math +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Set, Tuple -from .database import CrateDBClient, NodeInfo, ShardInfo, RecoveryInfo +from .database import 
CrateDBClient, NodeInfo, RecoveryInfo, ShardInfo @dataclass class MoveRecommendation: """Recommendation for moving a shard""" + table_name: str schema_name: str shard_id: int @@ -26,9 +27,11 @@ class MoveRecommendation: def to_sql(self) -> str: """Generate the SQL command for this move""" - return (f'ALTER TABLE "{self.schema_name}"."{self.table_name}" ' - f"REROUTE MOVE SHARD {self.shard_id} " - f"FROM '{self.from_node}' TO '{self.to_node}';") + return ( + f'ALTER TABLE "{self.schema_name}"."{self.table_name}" ' + f"REROUTE MOVE SHARD {self.shard_id} " + f"FROM '{self.from_node}' TO '{self.to_node}';" + ) @property def safety_score(self) -> float: @@ -50,6 +53,7 @@ def safety_score(self) -> float: @dataclass class DistributionStats: """Statistics about shard distribution""" + total_shards: int total_size_gb: float zones: Dict[str, int] @@ -65,14 +69,14 @@ def __init__(self, client: CrateDBClient): self.client = client self.nodes: List[NodeInfo] = [] self.shards: List[ShardInfo] = [] - + # Initialize session-based caches for performance self._zone_conflict_cache = {} self._node_lookup_cache = {} self._target_nodes_cache = {} self._cache_hits = 0 self._cache_misses = 0 - + self._refresh_data() def _refresh_data(self): @@ -112,7 +116,7 @@ def analyze_distribution(self, table_name: Optional[str] = None) -> Distribution zones=dict(zone_counts), nodes=dict(node_counts), zone_balance_score=zone_balance_score, - node_balance_score=node_balance_score + node_balance_score=node_balance_score, ) def _calculate_balance_score(self, counts: List[int]) -> float: @@ -134,10 +138,9 @@ def _calculate_balance_score(self, counts: List[int]) -> float: score = max(0, 100 * math.exp(-cv)) return round(score, 1) - def find_moveable_shards(self, - min_size_gb: float = 40.0, - max_size_gb: float = 60.0, - table_name: Optional[str] = None) -> List[ShardInfo]: + def find_moveable_shards( + self, min_size_gb: float = 40.0, max_size_gb: float = 60.0, table_name: Optional[str] = None + ) -> 
List[ShardInfo]: """Find shards that are candidates for moving based on size Only returns healthy shards that are safe to move. @@ -148,20 +151,19 @@ def find_moveable_shards(self, table_name=table_name, min_size_gb=min_size_gb, max_size_gb=max_size_gb, - for_analysis=False # Only operational shards + for_analysis=False, # Only operational shards ) - # Create a mapping of node names to available space node_space_map = {node.name: node.available_space_gb for node in self.nodes} # Sort by node available space (ascending, so low space nodes first), then by shard size - healthy_shards.sort(key=lambda s: (node_space_map.get(s.node_name, float('inf')), s.size_gb)) + healthy_shards.sort(key=lambda s: (node_space_map.get(s.node_name, float("inf")), s.size_gb)) return healthy_shards - def check_zone_balance(self, - table_name: Optional[str] = None, - tolerance_percent: float = 10.0) -> Dict[str, Dict[str, int]]: + def check_zone_balance( + self, table_name: Optional[str] = None, tolerance_percent: float = 10.0 + ) -> Dict[str, Dict[str, int]]: """Check if zones are balanced within tolerance""" # Filter shards by table if specified shards = self.shards @@ -169,21 +171,23 @@ def check_zone_balance(self, shards = [s for s in shards if s.table_name == table_name] # Count shards by zone and type - zone_stats = defaultdict(lambda: {'PRIMARY': 0, 'REPLICA': 0, 'TOTAL': 0}) + zone_stats = defaultdict(lambda: {"PRIMARY": 0, "REPLICA": 0, "TOTAL": 0}) for shard in shards: shard_type = shard.shard_type zone_stats[shard.zone][shard_type] += 1 - zone_stats[shard.zone]['TOTAL'] += 1 + zone_stats[shard.zone]["TOTAL"] += 1 return dict(zone_stats) - def find_nodes_with_capacity(self, - required_space_gb: float, - exclude_zones: Optional[Set[str]] = None, - exclude_nodes: Optional[Set[str]] = None, - min_free_space_gb: float = 100.0, - max_disk_usage_percent: float = 85.0) -> List[NodeInfo]: + def find_nodes_with_capacity( + self, + required_space_gb: float, + exclude_zones: 
Optional[Set[str]] = None, + exclude_nodes: Optional[Set[str]] = None, + min_free_space_gb: float = 100.0, + max_disk_usage_percent: float = 85.0, + ) -> List[NodeInfo]: """Find nodes that have capacity for additional shards Args: @@ -219,16 +223,18 @@ def find_nodes_with_capacity(self, available_nodes.sort(key=lambda n: n.available_space_gb, reverse=True) return available_nodes - def generate_rebalancing_recommendations(self, - table_name: Optional[str] = None, - min_size_gb: float = 40.0, - max_size_gb: float = 60.0, - zone_tolerance_percent: float = 10.0, - min_free_space_gb: float = 100.0, - max_recommendations: int = 10, - prioritize_space: bool = False, - source_node: Optional[str] = None, - max_disk_usage_percent: float = 90.0) -> List[MoveRecommendation]: + def generate_rebalancing_recommendations( + self, + table_name: Optional[str] = None, + min_size_gb: float = 40.0, + max_size_gb: float = 60.0, + zone_tolerance_percent: float = 10.0, + min_free_space_gb: float = 100.0, + max_recommendations: int = 10, + prioritize_space: bool = False, + source_node: Optional[str] = None, + max_disk_usage_percent: float = 90.0, + ) -> List[MoveRecommendation]: """Generate recommendations for rebalancing shards Args: @@ -241,7 +247,7 @@ def generate_rebalancing_recommendations(self, # Get moveable shards (only healthy ones for actual operations) moveable_shards = self.find_moveable_shards(min_size_gb, max_size_gb, table_name) - + print(f"Analyzing {len(moveable_shards)} candidate shards in size range {min_size_gb}-{max_size_gb}GB...") if not moveable_shards: @@ -251,7 +257,7 @@ def generate_rebalancing_recommendations(self, zone_stats = self.check_zone_balance(table_name, zone_tolerance_percent) # Calculate target distribution - total_shards = sum(stats['TOTAL'] for stats in zone_stats.values()) + total_shards = sum(stats["TOTAL"] for stats in zone_stats.values()) zones = list(zone_stats.keys()) target_per_zone = total_shards // len(zones) if zones else 0 @@ -260,10 
+266,10 @@ def generate_rebalancing_recommendations(self, underloaded_zones = [] for zone, stats in zone_stats.items(): - current_count = stats['TOTAL'] + current_count = stats["TOTAL"] threshold_high = target_per_zone * (1 + zone_tolerance_percent / 100) threshold_low = target_per_zone * (1 - zone_tolerance_percent / 100) - + if current_count > threshold_high: overloaded_zones.append(zone) elif current_count < threshold_low: @@ -277,9 +283,9 @@ def generate_rebalancing_recommendations(self, processing_shards = moveable_shards # Generate move recommendations - safe_recommendations = 0 + safe_recommendations = 0 # noqa: F841 total_evaluated = 0 - + for i, shard in enumerate(processing_shards): if len(recommendations) >= max_recommendations: break @@ -287,9 +293,9 @@ def generate_rebalancing_recommendations(self, # Show progress every 50 shards when processing many if len(processing_shards) > 100 and i > 0 and i % 50 == 0: print(".", end="", flush=True) - + total_evaluated += 1 - + # Skip based on priority mode if not prioritize_space: # Zone balancing mode: only move shards from overloaded zones @@ -302,14 +308,14 @@ def generate_rebalancing_recommendations(self, required_space_gb=shard.size_gb, exclude_nodes={shard.node_name}, # Don't move to same node min_free_space_gb=min_free_space_gb, - max_disk_usage_percent=max_disk_usage_percent + max_disk_usage_percent=max_disk_usage_percent, ) # Quick pre-filter to avoid expensive safety validations # Only check nodes in different zones (for zone balancing) if not prioritize_space: target_nodes = [node for node in target_nodes if node.zone != shard.zone] - + # Limit to top 3 candidates to reduce validation overhead target_nodes = target_nodes[:3] @@ -327,9 +333,9 @@ def generate_rebalancing_recommendations(self, to_zone=candidate_node.zone, shard_type=shard.shard_type, size_gb=shard.size_gb, - reason="Safety validation" + reason="Safety validation", ) - + # Check if this move would be safe is_safe, safety_msg = 
self.validate_move_safety(temp_rec, max_disk_usage_percent) if is_safe: @@ -389,7 +395,7 @@ def generate_rebalancing_recommendations(self, to_zone=target_node.zone, shard_type=shard.shard_type, size_gb=shard.size_gb, - reason=reason + reason=reason, ) recommendations.append(recommendation) @@ -400,8 +406,9 @@ def generate_rebalancing_recommendations(self, print(f"Performance: {self.get_cache_stats()}") return recommendations - def validate_move_safety(self, recommendation: MoveRecommendation, - max_disk_usage_percent: float = 90.0) -> Tuple[bool, str]: + def validate_move_safety( + self, recommendation: MoveRecommendation, max_disk_usage_percent: float = 90.0 + ) -> Tuple[bool, str]: """Validate that a move recommendation is safe to execute""" # Find target node (with caching) target_node = self._get_node_cached(recommendation.to_node) @@ -417,20 +424,24 @@ def validate_move_safety(self, recommendation: MoveRecommendation, # Check available space required_space_gb = recommendation.size_gb + 50 # 50GB buffer if target_node.available_space_gb < required_space_gb: - return False, f"Insufficient space on target node (need {required_space_gb:.1f}GB, have {target_node.available_space_gb:.1f}GB)" + return ( + False, + f"Insufficient space on target node (need {required_space_gb:.1f}GB, " + f"have {target_node.available_space_gb:.1f}GB)", + ) # Check disk usage if target_node.disk_usage_percent > max_disk_usage_percent: return False, f"Target node disk usage too high ({target_node.disk_usage_percent:.1f}%)" return True, "Move appears safe" - + def _get_node_cached(self, node_name: str): """Get node by name with caching""" if node_name in self._node_lookup_cache: self._cache_hits += 1 return self._node_lookup_cache[node_name] - + # Find node (cache miss) self._cache_misses += 1 target_node = None @@ -438,64 +449,65 @@ def _get_node_cached(self, node_name: str): if node.name == node_name: target_node = node break - + self._node_lookup_cache[node_name] = target_node return 
target_node - + def _check_zone_conflict_cached(self, recommendation: MoveRecommendation) -> Optional[str]: """Check zone conflicts with caching""" # Create cache key: table, shard, target zone target_zone = self._get_node_zone(recommendation.to_node) cache_key = (recommendation.table_name, recommendation.shard_id, target_zone) - + if cache_key in self._zone_conflict_cache: self._cache_hits += 1 return self._zone_conflict_cache[cache_key] - + # Cache miss - do expensive check self._cache_misses += 1 result = self._check_zone_conflict(recommendation) self._zone_conflict_cache[cache_key] = result return result - + def _get_node_zone(self, node_name: str) -> str: """Get zone for a node name""" node = self._get_node_cached(node_name) return node.zone if node else "unknown" - + def get_cache_stats(self) -> str: """Get cache performance statistics""" total = self._cache_hits + self._cache_misses if total == 0: return "Cache stats: No operations yet" - + hit_rate = (self._cache_hits / total) * 100 return f"Cache stats: {hit_rate:.1f}% hit rate ({self._cache_hits} hits, {self._cache_misses} misses)" - - def _find_nodes_with_capacity_cached(self, required_space_gb: float, exclude_nodes: set, - min_free_space_gb: float, max_disk_usage_percent: float) -> List[NodeInfo]: + + def _find_nodes_with_capacity_cached( + self, required_space_gb: float, exclude_nodes: set, min_free_space_gb: float, max_disk_usage_percent: float + ) -> List[NodeInfo]: """Find nodes with capacity using caching for repeated queries""" # Create cache key based on parameters (rounded to avoid float precision issues) cache_key = ( round(required_space_gb, 1), frozenset(exclude_nodes), round(min_free_space_gb, 1), - round(max_disk_usage_percent, 1) + round(max_disk_usage_percent, 1), ) - + if cache_key in self._target_nodes_cache: self._cache_hits += 1 return self._target_nodes_cache[cache_key] - + # Cache miss - do expensive calculation self._cache_misses += 1 result = self.find_nodes_with_capacity( 
required_space_gb=required_space_gb, exclude_nodes=exclude_nodes, min_free_space_gb=min_free_space_gb, - max_disk_usage_percent=max_disk_usage_percent + max_disk_usage_percent=max_disk_usage_percent, ) - + self._target_nodes_cache[cache_key] = result return result @@ -526,14 +538,15 @@ def _check_zone_conflict(self, recommendation: MoveRecommendation) -> Optional[s ORDER BY s."primary" DESC, zone, node_name """ - result = self.client.execute_query(query, [ - recommendation.table_name, - recommendation.schema_name, - recommendation.shard_id - ]) + result = self.client.execute_query( + query, [recommendation.table_name, recommendation.schema_name, recommendation.shard_id] + ) - if not result.get('rows'): - return f"Cannot find shard {recommendation.shard_id} for table {recommendation.schema_name}.{recommendation.table_name}" + if not result.get("rows"): + return ( + f"Cannot find shard {recommendation.shard_id} " + f"for table {recommendation.schema_name}.{recommendation.table_name}" + ) # Analyze current distribution zones_with_copies = set() @@ -552,23 +565,23 @@ def _check_zone_conflict(self, recommendation: MoveRecommendation) -> Optional[s if not target_node_id: return f"Target node {recommendation.to_node} not found in cluster" - for row in result['rows']: + for row in result["rows"]: node_id, node_name, zone, is_primary, routing_state, state = row - zone = zone or 'unknown' + zone = zone or "unknown" total_copies += 1 # Track the shard we're planning to move if node_name == recommendation.from_node: current_location = { - 'zone': zone, - 'is_primary': is_primary, - 'routing_state': routing_state, - 'state': state + "zone": zone, + "is_primary": is_primary, + "routing_state": routing_state, + "state": state, } # Track all copies for conflict detection nodes_with_copies.add(node_id) - if routing_state == 'STARTED' and state == 'STARTED': + if routing_state == "STARTED" and state == "STARTED": healthy_copies += 1 zones_with_copies.add(zone) @@ -576,23 +589,29 @@ 
def _check_zone_conflict(self, recommendation: MoveRecommendation) -> Optional[s if not current_location: return f"Shard not found on source node {recommendation.from_node}" - if current_location['routing_state'] != 'STARTED': + if current_location["routing_state"] != "STARTED": return f"Source shard is not in STARTED state (current: {current_location['routing_state']})" # CRITICAL CHECK 1: Target node already has a copy of this shard if target_node_id in nodes_with_copies: - return f"Node conflict: Target node {recommendation.to_node} already has a copy of shard {recommendation.shard_id}" + return ( + f"Node conflict: Target node {recommendation.to_node} " + f"already has a copy of shard {recommendation.shard_id}" + ) # CRITICAL CHECK 2: Target zone already has a copy (zone allocation limits) if recommendation.to_zone in zones_with_copies: return f"Zone conflict: {recommendation.to_zone} already has a copy of shard {recommendation.shard_id}" # CRITICAL CHECK 3: Ensure we're not creating a single point of failure - if len(zones_with_copies) == 1 and current_location['zone'] in zones_with_copies: + if len(zones_with_copies) == 1 and current_location["zone"] in zones_with_copies: # This is the only zone with this shard - moving it is good for zone distribution pass elif len(zones_with_copies) <= 1 and healthy_copies <= 1: - return f"Safety concern: Only {healthy_copies} healthy copy(ies) exist. Moving might risk data availability." + return ( + f"Safety concern: Only {healthy_copies} healthy copy(ies) exist. " + f"Moving might risk data availability." 
+ ) # ADDITIONAL CHECK: Verify zone allocation constraints for this table table_zone_query = """ @@ -609,21 +628,22 @@ def _check_zone_conflict(self, recommendation: MoveRecommendation) -> Optional[s ORDER BY zone """ - zone_result = self.client.execute_query(table_zone_query, [ - recommendation.table_name, - recommendation.schema_name, - recommendation.shard_id - ]) + zone_result = self.client.execute_query( + table_zone_query, [recommendation.table_name, recommendation.schema_name, recommendation.shard_id] + ) current_zone_counts = {} - for row in zone_result.get('rows', []): + for row in zone_result.get("rows", []): zone_name, count = row - current_zone_counts[zone_name or 'unknown'] = count + current_zone_counts[zone_name or "unknown"] = count # Check if adding to target zone would violate balance target_zone_count = current_zone_counts.get(recommendation.to_zone, 0) if target_zone_count > 0: - return f"Zone allocation violation: {recommendation.to_zone} would have {target_zone_count + 1} copies after move" + return ( + f"Zone allocation violation: {recommendation.to_zone} " + f"would have {target_zone_count + 1} copies after move." 
+ ) return None @@ -635,71 +655,70 @@ def get_cluster_overview(self) -> Dict[str, Any]: """Get a comprehensive overview of the cluster""" # Get cluster watermark settings watermarks = self.client.get_cluster_watermarks() - + overview = { - 'nodes': len(self.nodes), - 'zones': len(set(node.zone for node in self.nodes)), - 'total_shards': len(self.shards), - 'primary_shards': len([s for s in self.shards if s.is_primary]), - 'replica_shards': len([s for s in self.shards if not s.is_primary]), - 'total_size_gb': sum(s.size_gb for s in self.shards), - 'zone_distribution': defaultdict(int), - 'node_health': [], - 'watermarks': watermarks + "nodes": len(self.nodes), + "zones": len({node.zone for node in self.nodes}), + "total_shards": len(self.shards), + "primary_shards": len([s for s in self.shards if s.is_primary]), + "replica_shards": len([s for s in self.shards if not s.is_primary]), + "total_size_gb": sum(s.size_gb for s in self.shards), + "zone_distribution": defaultdict(int), + "node_health": [], + "watermarks": watermarks, } # Zone distribution for shard in self.shards: - overview['zone_distribution'][shard.zone] += 1 - overview['zone_distribution'] = dict(overview['zone_distribution']) + overview["zone_distribution"][shard.zone] += 1 + overview["zone_distribution"] = dict(overview["zone_distribution"]) # Node health with watermark calculations for node in self.nodes: node_shards = [s for s in self.shards if s.node_name == node.name] watermark_info = self._calculate_node_watermark_remaining(node, watermarks) - - overview['node_health'].append({ - 'name': node.name, - 'zone': node.zone, - 'shards': len(node_shards), - 'size_gb': sum(s.size_gb for s in node_shards), - 'disk_usage_percent': node.disk_usage_percent, - 'heap_usage_percent': node.heap_usage_percent, - 'available_space_gb': node.available_space_gb, - 'remaining_to_low_watermark_gb': watermark_info['remaining_to_low_gb'], - 'remaining_to_high_watermark_gb': watermark_info['remaining_to_high_gb'] - }) + + 
overview["node_health"].append( + { + "name": node.name, + "zone": node.zone, + "shards": len(node_shards), + "size_gb": sum(s.size_gb for s in node_shards), + "disk_usage_percent": node.disk_usage_percent, + "heap_usage_percent": node.heap_usage_percent, + "available_space_gb": node.available_space_gb, + "remaining_to_low_watermark_gb": watermark_info["remaining_to_low_gb"], + "remaining_to_high_watermark_gb": watermark_info["remaining_to_high_gb"], + } + ) return overview - def _calculate_node_watermark_remaining(self, node: 'NodeInfo', watermarks: Dict[str, Any]) -> Dict[str, float]: + def _calculate_node_watermark_remaining(self, node: "NodeInfo", watermarks: Dict[str, Any]) -> Dict[str, float]: """Calculate remaining space until watermarks are reached""" - + # Parse watermark percentages - low_watermark = self._parse_watermark_percentage(watermarks.get('low', '85%')) - high_watermark = self._parse_watermark_percentage(watermarks.get('high', '90%')) - + low_watermark = self._parse_watermark_percentage(watermarks.get("low", "85%")) + high_watermark = self._parse_watermark_percentage(watermarks.get("high", "90%")) + # Calculate remaining space to each watermark total_space_bytes = node.fs_total current_used_bytes = node.fs_used - + # Space that would be used at each watermark low_watermark_used_bytes = total_space_bytes * (low_watermark / 100.0) high_watermark_used_bytes = total_space_bytes * (high_watermark / 100.0) - + # Remaining space until each watermark (negative if already exceeded) remaining_to_low_gb = max(0, (low_watermark_used_bytes - current_used_bytes) / (1024**3)) remaining_to_high_gb = max(0, (high_watermark_used_bytes - current_used_bytes) / (1024**3)) - - return { - 'remaining_to_low_gb': remaining_to_low_gb, - 'remaining_to_high_gb': remaining_to_high_gb - } - + + return {"remaining_to_low_gb": remaining_to_low_gb, "remaining_to_high_gb": remaining_to_high_gb} + def _parse_watermark_percentage(self, watermark_value: str) -> float: """Parse 
watermark percentage from string like '85%' or '0.85'""" if isinstance(watermark_value, str): - if watermark_value.endswith('%'): + if watermark_value.endswith("%"): return float(watermark_value[:-1]) else: # Handle decimal format like '0.85' @@ -715,8 +734,7 @@ def _parse_watermark_percentage(self, watermark_value: str) -> float: # Default to common values if parsing fails return 85.0 # Default low watermark - def plan_node_decommission(self, node_name: str, - min_free_space_gb: float = 100.0) -> Dict[str, Any]: + def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100.0) -> Dict[str, Any]: """Plan the decommissioning of a node by analyzing required shard moves Args: @@ -734,26 +752,21 @@ def plan_node_decommission(self, node_name: str, break if not target_node: - return { - 'error': f"Node {node_name} not found in cluster", - 'feasible': False - } + return {"error": f"Node {node_name} not found in cluster", "feasible": False} # Get all shards on this node (only healthy ones for safety) - node_shards = [s for s in self.shards - if s.node_name == node_name - and s.routing_state == 'STARTED'] + node_shards = [s for s in self.shards if s.node_name == node_name and s.routing_state == "STARTED"] if not node_shards: return { - 'node': node_name, - 'zone': target_node.zone, - 'feasible': True, - 'shards_to_move': 0, - 'total_size_gb': 0, - 'recommendations': [], - 'warnings': [], - 'message': 'Node has no healthy shards - safe to decommission' + "node": node_name, + "zone": target_node.zone, + "feasible": True, + "shards_to_move": 0, + "total_size_gb": 0, + "recommendations": [], + "warnings": [], + "message": "Node has no healthy shards - safe to decommission", } # Calculate space requirements @@ -767,17 +780,17 @@ def plan_node_decommission(self, node_name: str, for shard in node_shards: # Find nodes that can accommodate this shard potential_targets = self.find_nodes_with_capacity( - shard.size_gb, - exclude_nodes={node_name}, - 
min_free_space_gb=min_free_space_gb + shard.size_gb, exclude_nodes={node_name}, min_free_space_gb=min_free_space_gb ) if not potential_targets: - infeasible_moves.append({ - 'shard': f"{shard.schema_name}.{shard.table_name}[{shard.shard_id}]", - 'size_gb': shard.size_gb, - 'reason': 'No nodes with sufficient capacity' - }) + infeasible_moves.append( + { + "shard": f"{shard.schema_name}.{shard.table_name}[{shard.shard_id}]", + "size_gb": shard.size_gb, + "reason": "No nodes with sufficient capacity", + } + ) continue # Check for zone conflicts @@ -794,36 +807,42 @@ def plan_node_decommission(self, node_name: str, to_zone=target.zone, shard_type=shard.shard_type, size_gb=shard.size_gb, - reason=f"Node decommission: {node_name}" + reason=f"Node decommission: {node_name}", ) zone_conflict = self._check_zone_conflict(temp_rec) if not zone_conflict: safe_targets.append(target) else: - warnings.append(f"Zone conflict for {shard.schema_name}.{shard.table_name}[{shard.shard_id}]: {zone_conflict}") + warnings.append( + f"Zone conflict for {shard.schema_name}.{shard.table_name}[{shard.shard_id}]: {zone_conflict}" + ) if safe_targets: # Choose the target with most available space best_target = safe_targets[0] - move_plan.append(MoveRecommendation( - table_name=shard.table_name, - schema_name=shard.schema_name, - shard_id=shard.shard_id, - from_node=node_name, - to_node=best_target.name, - from_zone=shard.zone, - to_zone=best_target.zone, - shard_type=shard.shard_type, - size_gb=shard.size_gb, - reason=f"Node decommission: {node_name}" - )) + move_plan.append( + MoveRecommendation( + table_name=shard.table_name, + schema_name=shard.schema_name, + shard_id=shard.shard_id, + from_node=node_name, + to_node=best_target.name, + from_zone=shard.zone, + to_zone=best_target.zone, + shard_type=shard.shard_type, + size_gb=shard.size_gb, + reason=f"Node decommission: {node_name}", + ) + ) else: - infeasible_moves.append({ - 'shard': 
f"{shard.schema_name}.{shard.table_name}[{shard.shard_id}]", - 'size_gb': shard.size_gb, - 'reason': 'Zone conflicts prevent safe move' - }) + infeasible_moves.append( + { + "shard": f"{shard.schema_name}.{shard.table_name}[{shard.shard_id}]", + "size_gb": shard.size_gb, + "reason": "Zone conflicts prevent safe move", + } + ) # Determine feasibility feasible = len(infeasible_moves) == 0 @@ -833,144 +852,139 @@ def plan_node_decommission(self, node_name: str, # Check if remaining cluster capacity is sufficient after decommission remaining_capacity = sum(n.available_space_gb for n in self.nodes if n.name != node_name) if remaining_capacity < total_size_gb * 1.2: # 20% safety margin - warnings.append(f"Low remaining capacity after decommission. Only {remaining_capacity:.1f}GB available for {total_size_gb:.1f}GB of data") + warnings.append( + f"Low remaining capacity after decommission. " + f"Only {remaining_capacity:.1f}GB available for {total_size_gb:.1f}GB of data" + ) return { - 'node': node_name, - 'zone': target_node.zone, - 'feasible': feasible, - 'shards_to_move': len(node_shards), - 'moveable_shards': len(move_plan), - 'total_size_gb': total_size_gb, - 'recommendations': move_plan, - 'infeasible_moves': infeasible_moves, - 'warnings': warnings, - 'estimated_time_hours': len(move_plan) * 0.1, # Rough estimate: 6 minutes per move - 'message': 'Decommission plan generated' if feasible else 'Decommission not currently feasible' + "node": node_name, + "zone": target_node.zone, + "feasible": feasible, + "shards_to_move": len(node_shards), + "moveable_shards": len(move_plan), + "total_size_gb": total_size_gb, + "recommendations": move_plan, + "infeasible_moves": infeasible_moves, + "warnings": warnings, + "estimated_time_hours": len(move_plan) * 0.1, # Rough estimate: 6 minutes per move + "message": "Decommission plan generated" if feasible else "Decommission not currently feasible", } class RecoveryMonitor: """Monitor shard recovery operations""" - + def 
__init__(self, client: CrateDBClient): self.client = client - - def get_cluster_recovery_status(self, - table_name: Optional[str] = None, - node_name: Optional[str] = None, - recovery_type_filter: str = 'all', - include_transitioning: bool = False) -> List[RecoveryInfo]: + + def get_cluster_recovery_status( + self, + table_name: Optional[str] = None, + node_name: Optional[str] = None, + recovery_type_filter: str = "all", + include_transitioning: bool = False, + ) -> List[RecoveryInfo]: """Get comprehensive recovery status with minimal cluster impact""" - + # Get all recovering shards using the efficient combined query recoveries = self.client.get_all_recovering_shards(table_name, node_name, include_transitioning) - + # Apply recovery type filter - if recovery_type_filter != 'all': + if recovery_type_filter != "all": recoveries = [r for r in recoveries if r.recovery_type.upper() == recovery_type_filter.upper()] - + return recoveries - + def get_recovery_summary(self, recoveries: List[RecoveryInfo]) -> Dict[str, Any]: """Generate a summary of recovery operations""" - + if not recoveries: - return { - 'total_recoveries': 0, - 'by_type': {}, - 'by_stage': {}, - 'avg_progress': 0.0, - 'total_size_gb': 0.0 - } - + return {"total_recoveries": 0, "by_type": {}, "by_stage": {}, "avg_progress": 0.0, "total_size_gb": 0.0} + # Group by recovery type by_type = {} by_stage = {} total_progress = 0.0 total_size_gb = 0.0 - + for recovery in recoveries: # By type if recovery.recovery_type not in by_type: - by_type[recovery.recovery_type] = { - 'count': 0, - 'total_size_gb': 0.0, - 'avg_progress': 0.0 - } - by_type[recovery.recovery_type]['count'] += 1 - by_type[recovery.recovery_type]['total_size_gb'] += recovery.size_gb - + by_type[recovery.recovery_type] = {"count": 0, "total_size_gb": 0.0, "avg_progress": 0.0} + by_type[recovery.recovery_type]["count"] += 1 + by_type[recovery.recovery_type]["total_size_gb"] += recovery.size_gb + # By stage if recovery.stage not in by_stage: 
by_stage[recovery.stage] = 0 by_stage[recovery.stage] += 1 - + # Totals total_progress += recovery.overall_progress total_size_gb += recovery.size_gb - + # Calculate averages for type_name, rec_type in by_type.items(): - if rec_type['count'] > 0: + if rec_type["count"] > 0: type_recoveries = [r for r in recoveries if r.recovery_type == type_name] if type_recoveries: - rec_type['avg_progress'] = sum(r.overall_progress for r in type_recoveries) / len(type_recoveries) - + rec_type["avg_progress"] = sum(r.overall_progress for r in type_recoveries) / len(type_recoveries) + return { - 'total_recoveries': len(recoveries), - 'by_type': by_type, - 'by_stage': by_stage, - 'avg_progress': total_progress / len(recoveries) if recoveries else 0.0, - 'total_size_gb': total_size_gb + "total_recoveries": len(recoveries), + "by_type": by_type, + "by_stage": by_stage, + "avg_progress": total_progress / len(recoveries) if recoveries else 0.0, + "total_size_gb": total_size_gb, } - + def format_recovery_display(self, recoveries: List[RecoveryInfo]) -> str: """Format recovery information for display""" - + if not recoveries: return "βœ… No active shard recoveries found" - + # Group by recovery type - peer_recoveries = [r for r in recoveries if r.recovery_type == 'PEER'] - disk_recoveries = [r for r in recoveries if r.recovery_type == 'DISK'] - other_recoveries = [r for r in recoveries if r.recovery_type not in ['PEER', 'DISK']] - + peer_recoveries = [r for r in recoveries if r.recovery_type == "PEER"] + disk_recoveries = [r for r in recoveries if r.recovery_type == "DISK"] + other_recoveries = [r for r in recoveries if r.recovery_type not in ["PEER", "DISK"]] + output = [f"\nπŸ”„ Active Shard Recoveries ({len(recoveries)} total)"] output.append("=" * 80) - + if peer_recoveries: output.append(f"\nπŸ“‘ PEER Recoveries ({len(peer_recoveries)})") output.append(self._format_recovery_table(peer_recoveries)) - + if disk_recoveries: output.append(f"\nπŸ’Ύ DISK Recoveries 
({len(disk_recoveries)})") output.append(self._format_recovery_table(disk_recoveries)) - + if other_recoveries: output.append(f"\nπŸ”§ Other Recoveries ({len(other_recoveries)})") output.append(self._format_recovery_table(other_recoveries)) - + # Add summary summary = self.get_recovery_summary(recoveries) - output.append(f"\nπŸ“Š Summary:") + output.append("\nπŸ“Š Summary:") output.append(f" Total size: {summary['total_size_gb']:.1f} GB") output.append(f" Average progress: {summary['avg_progress']:.1f}%") - + return "\n".join(output) - + def _format_recovery_table(self, recoveries: List[RecoveryInfo]) -> str: """Format a table of recovery information""" - + if not recoveries: return " No recoveries of this type" - + # Table headers headers = ["Table", "Shard", "Node", "Type", "Stage", "Progress", "Size(GB)", "Time(s)"] - + # Calculate column widths col_widths = [len(h) for h in headers] - + rows = [] for recovery in recoveries: row = [ @@ -981,25 +995,25 @@ def _format_recovery_table(self, recoveries: List[RecoveryInfo]) -> str: recovery.stage, f"{recovery.overall_progress:.1f}%", f"{recovery.size_gb:.1f}", - f"{recovery.total_time_seconds:.1f}" + f"{recovery.total_time_seconds:.1f}", ] rows.append(row) - + # Update column widths for i, cell in enumerate(row): col_widths[i] = max(col_widths[i], len(cell)) - + # Format table output = [] - + # Header row header_row = " " + " | ".join(h.ljust(w) for h, w in zip(headers, col_widths)) output.append(header_row) output.append(" " + "-" * (len(header_row) - 3)) - + # Data rows for row in rows: data_row = " " + " | ".join(cell.ljust(w) for cell, w in zip(row, col_widths)) output.append(data_row) - + return "\n".join(output) diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index e4f5800d..2ce29fdf 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -4,23 +4,16 @@ import sys import time -import os from typing import Optional -try: - import click - 
from rich.console import Console - from rich.table import Table - from rich.panel import Panel - from rich.text import Text - from rich import box -except ImportError as e: - print(f"Missing required dependency: {e}") - print("Please install dependencies with: pip install -e .") - sys.exit(1) -from .database import CrateDBClient -from .analyzer import ShardAnalyzer, RecoveryMonitor +import click +from rich import box +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from .analyzer import MoveRecommendation, RecoveryMonitor, ShardAnalyzer +from .database import CrateDBClient console = Console() @@ -28,11 +21,11 @@ def format_size(size_gb: float) -> str: """Format size in GB with appropriate precision""" if size_gb >= 1000: - return f"{size_gb/1000:.1f}TB" + return f"{size_gb / 1000:.1f}TB" elif size_gb >= 1: return f"{size_gb:.1f}GB" else: - return f"{size_gb*1000:.0f}MB" + return f"{size_gb * 1000:.0f}MB" def format_percentage(value: float) -> str: @@ -48,13 +41,13 @@ def format_percentage(value: float) -> str: def format_translog_info(recovery_info) -> str: """Format translog size information with color coding""" tl_bytes = recovery_info.translog_size_bytes - + # Only show if significant (>10MB for production) if tl_bytes < 10 * 1024 * 1024: # 10MB for production return "" - + tl_gb = recovery_info.translog_size_gb - + # Color coding based on size if tl_gb >= 5.0: color = "red" @@ -62,13 +55,13 @@ def format_translog_info(recovery_info) -> str: color = "yellow" else: color = "green" - + # Format size if tl_gb >= 1.0: size_str = f"{tl_gb:.1f}GB" else: - size_str = f"{tl_gb*1000:.0f}MB" - + size_str = f"{tl_gb * 1000:.0f}MB" + return f" [dim]([{color}]TL:{size_str}[/{color}])[/dim]" @@ -90,18 +83,18 @@ def main(ctx): console.print("[red]Error: Could not connect to CrateDB[/red]") console.print("Please check your CRATE_CONNECTION_STRING in .env file") sys.exit(1) - ctx.obj['client'] = client + ctx.obj["client"] = client 
except Exception as e: console.print(f"[red]Error connecting to CrateDB: {e}[/red]") sys.exit(1) @main.command() -@click.option('--table', '-t', help='Analyze specific table only') +@click.option("--table", "-t", help="Analyze specific table only") @click.pass_context def analyze(ctx, table: Optional[str]): """Analyze current shard distribution across nodes and zones""" - client = ctx.obj['client'] + client = ctx.obj["client"] analyzer = ShardAnalyzer(client) console.print(Panel.fit("[bold blue]CrateDB Cluster Analysis[/bold blue]")) @@ -114,27 +107,29 @@ def analyze(ctx, table: Optional[str]): summary_table.add_column("Metric", style="cyan") summary_table.add_column("Value", style="magenta") - summary_table.add_row("Nodes", str(overview['nodes'])) - summary_table.add_row("Availability Zones", str(overview['zones'])) - summary_table.add_row("Total Shards", str(overview['total_shards'])) - summary_table.add_row("Primary Shards", str(overview['primary_shards'])) - summary_table.add_row("Replica Shards", str(overview['replica_shards'])) - summary_table.add_row("Total Size", format_size(overview['total_size_gb'])) + summary_table.add_row("Nodes", str(overview["nodes"])) + summary_table.add_row("Availability Zones", str(overview["zones"])) + summary_table.add_row("Total Shards", str(overview["total_shards"])) + summary_table.add_row("Primary Shards", str(overview["primary_shards"])) + summary_table.add_row("Replica Shards", str(overview["replica_shards"])) + summary_table.add_row("Total Size", format_size(overview["total_size_gb"])) console.print(summary_table) console.print() # Disk watermarks table - if overview.get('watermarks'): + if overview.get("watermarks"): watermarks_table = Table(title="Disk Allocation Watermarks", box=box.ROUNDED) watermarks_table.add_column("Setting", style="cyan") watermarks_table.add_column("Value", style="magenta") - watermarks = overview['watermarks'] - watermarks_table.add_row("Low Watermark", str(watermarks.get('low', 'Not set'))) - 
watermarks_table.add_row("High Watermark", str(watermarks.get('high', 'Not set'))) - watermarks_table.add_row("Flood Stage", str(watermarks.get('flood_stage', 'Not set'))) - watermarks_table.add_row("Enable for Single Node", str(watermarks.get('enable_for_single_data_node', 'Not set'))) + watermarks = overview["watermarks"] + watermarks_table.add_row("Low Watermark", str(watermarks.get("low", "Not set"))) + watermarks_table.add_row("High Watermark", str(watermarks.get("high", "Not set"))) + watermarks_table.add_row("Flood Stage", str(watermarks.get("flood_stage", "Not set"))) + watermarks_table.add_row( + "Enable for Single Node", str(watermarks.get("enable_for_single_data_node", "Not set")) + ) console.print(watermarks_table) console.print() @@ -145,8 +140,8 @@ def analyze(ctx, table: Optional[str]): zone_table.add_column("Shards", justify="right", style="magenta") zone_table.add_column("Percentage", justify="right", style="green") - total_shards = overview['total_shards'] - for zone, count in overview['zone_distribution'].items(): + total_shards = overview["total_shards"] + for zone, count in overview["zone_distribution"].items(): percentage = (count / total_shards * 100) if total_shards > 0 else 0 zone_table.add_row(zone, str(count), f"{percentage:.1f}%") @@ -164,20 +159,28 @@ def analyze(ctx, table: Optional[str]): node_table.add_column("Until Low WM", justify="right", style="yellow") node_table.add_column("Until High WM", justify="right", style="red") - for node_info in overview['node_health']: + for node_info in overview["node_health"]: # Format watermark remaining capacity - low_wm_remaining = format_size(node_info['remaining_to_low_watermark_gb']) if node_info['remaining_to_low_watermark_gb'] > 0 else "[red]Exceeded[/red]" - high_wm_remaining = format_size(node_info['remaining_to_high_watermark_gb']) if node_info['remaining_to_high_watermark_gb'] > 0 else "[red]Exceeded[/red]" + low_wm_remaining = ( + format_size(node_info["remaining_to_low_watermark_gb"]) 
+ if node_info["remaining_to_low_watermark_gb"] > 0 + else "[red]Exceeded[/red]" + ) + high_wm_remaining = ( + format_size(node_info["remaining_to_high_watermark_gb"]) + if node_info["remaining_to_high_watermark_gb"] > 0 + else "[red]Exceeded[/red]" + ) node_table.add_row( - node_info['name'], - node_info['zone'], - str(node_info['shards']), - format_size(node_info['size_gb']), - format_percentage(node_info['disk_usage_percent']), - format_size(node_info['available_space_gb']), + node_info["name"], + node_info["zone"], + str(node_info["shards"]), + format_size(node_info["size_gb"]), + format_percentage(node_info["disk_usage_percent"]), + format_size(node_info["available_space_gb"]), low_wm_remaining, - high_wm_remaining + high_wm_remaining, ) console.print(node_table) @@ -202,11 +205,11 @@ def analyze(ctx, table: Optional[str]): @main.command() -@click.option('--table', '-t', help='Find candidates for specific table only') -@click.option('--min-size', default=40.0, help='Minimum shard size in GB (default: 40)') -@click.option('--max-size', default=60.0, help='Maximum shard size in GB (default: 60)') -@click.option('--limit', default=20, help='Maximum number of candidates to show (default: 20)') -@click.option('--node', help='Only show candidates from this specific source node (e.g., data-hot-4)') +@click.option("--table", "-t", help="Find candidates for specific table only") +@click.option("--min-size", default=40.0, help="Minimum shard size in GB (default: 40)") +@click.option("--max-size", default=60.0, help="Maximum shard size in GB (default: 60)") +@click.option("--limit", default=20, help="Maximum number of candidates to show (default: 20)") +@click.option("--node", help="Only show candidates from this specific source node (e.g., data-hot-4)") @click.pass_context def find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, limit: int, node: Optional[str]): """Find shard candidates for movement based on size criteria @@ -214,7 +217,7 @@ def 
find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, Results are sorted by nodes with least available space first, then by shard size (smallest first) for easier moves. """ - client = ctx.obj['client'] + client = ctx.obj["client"] analyzer = ShardAnalyzer(client) console.print(Panel.fit(f"[bold blue]Finding Moveable Shards ({min_size}-{max_size}GB)[/bold blue]")) @@ -232,7 +235,7 @@ def find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, if not candidates: if node: console.print(f"[yellow]No moveable shards found on node '{node}' in the specified size range.[/yellow]") - console.print(f"[dim]Tip: Try different size ranges or remove --node filter to see all candidates[/dim]") + console.print("[dim]Tip: Try different size ranges or remove --node filter to see all candidates[/dim]") else: console.print("[yellow]No moveable shards found in the specified size range.[/yellow]") return @@ -240,7 +243,9 @@ def find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, # Show limited results shown_candidates = candidates[:limit] - candidates_table = Table(title=f"Moveable Shard Candidates (showing {len(shown_candidates)} of {len(candidates)})", box=box.ROUNDED) + candidates_table = Table( + title=f"Moveable Shard Candidates (showing {len(shown_candidates)} of {len(candidates)})", box=box.ROUNDED + ) candidates_table.add_column("Table", style="cyan") candidates_table.add_column("Shard ID", justify="right", style="magenta") candidates_table.add_column("Type", style="blue") @@ -263,7 +268,7 @@ def find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, shard.zone, format_size(shard.size_gb), format_size(node_free_space), - f"{shard.num_docs:,}" + f"{shard.num_docs:,}", ) console.print(candidates_table) @@ -273,26 +278,51 @@ def find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, @main.command() -@click.option('--table', '-t', help='Generate recommendations for 
specific table only') -@click.option('--min-size', default=40.0, help='Minimum shard size in GB (default: 40)') -@click.option('--max-size', default=60.0, help='Maximum shard size in GB (default: 60)') -@click.option('--zone-tolerance', default=10.0, help='Zone balance tolerance percentage (default: 10)') -@click.option('--min-free-space', default=100.0, help='Minimum free space required on target nodes in GB (default: 100)') -@click.option('--max-moves', default=10, help='Maximum number of move recommendations (default: 10)') -@click.option('--max-disk-usage', default=90.0, help='Maximum disk usage percentage for target nodes (default: 90)') - -@click.option('--validate/--no-validate', default=True, help='Validate move safety (default: True)') -@click.option('--prioritize-space/--prioritize-zones', default=False, help='Prioritize available space over zone balancing (default: False)') -@click.option('--dry-run/--execute', default=True, help='Show what would be done without generating SQL commands (default: True)') -@click.option('--auto-execute', is_flag=True, default=False, help='DANGER: Automatically execute the SQL commands (requires --execute, asks for confirmation)') -@click.option('--node', help='Only recommend moves from this specific source node (e.g., data-hot-4)') +@click.option("--table", "-t", help="Generate recommendations for specific table only") +@click.option("--min-size", default=40.0, help="Minimum shard size in GB (default: 40)") +@click.option("--max-size", default=60.0, help="Maximum shard size in GB (default: 60)") +@click.option("--zone-tolerance", default=10.0, help="Zone balance tolerance percentage (default: 10)") +@click.option( + "--min-free-space", default=100.0, help="Minimum free space required on target nodes in GB (default: 100)" +) +@click.option("--max-moves", default=10, help="Maximum number of move recommendations (default: 10)") +@click.option("--max-disk-usage", default=90.0, help="Maximum disk usage percentage for target 
nodes (default: 90)") +@click.option("--validate/--no-validate", default=True, help="Validate move safety (default: True)") +@click.option( + "--prioritize-space/--prioritize-zones", + default=False, + help="Prioritize available space over zone balancing (default: False)", +) +@click.option( + "--dry-run/--execute", default=True, help="Show what would be done without generating SQL commands (default: True)" +) +@click.option( + "--auto-execute", + is_flag=True, + default=False, + help="DANGER: Automatically execute the SQL commands (requires --execute, asks for confirmation)", +) +@click.option("--node", help="Only recommend moves from this specific source node (e.g., data-hot-4)") @click.pass_context -def recommend(ctx, table: Optional[str], min_size: float, max_size: float, - zone_tolerance: float, min_free_space: float, max_moves: int, max_disk_usage: float, validate: bool, prioritize_space: bool, dry_run: bool, auto_execute: bool, node: Optional[str]): +def recommend( + ctx, + table: Optional[str], + min_size: float, + max_size: float, + zone_tolerance: float, + min_free_space: float, + max_moves: int, + max_disk_usage: float, + validate: bool, + prioritize_space: bool, + dry_run: bool, + auto_execute: bool, + node: Optional[str], +): """Generate shard movement recommendations for rebalancing""" - client = ctx.obj['client'] + client = ctx.obj["client"] analyzer = ShardAnalyzer(client) - + # Safety check for auto-execute if auto_execute and dry_run: console.print("[red]❌ Error: --auto-execute requires --execute flag[/red]") @@ -300,7 +330,12 @@ def recommend(ctx, table: Optional[str], min_size: float, max_size: float, return mode_text = "DRY RUN - Analysis Only" if dry_run else "EXECUTION MODE" - console.print(Panel.fit(f"[bold blue]Generating Rebalancing Recommendations[/bold blue] - [bold {'green' if dry_run else 'red'}]{mode_text}[/bold {'green' if dry_run else 'red'}]")) + console.print( + Panel.fit( + f"[bold blue]Generating Rebalancing 
Recommendations[/bold blue] - " + f"[bold {'green' if dry_run else 'red'}]{mode_text}[/bold {'green' if dry_run else 'red'}]" + ) + ) console.print("[dim]Note: Only analyzing healthy shards (STARTED + 100% recovered) for safe operations[/dim]") console.print("[dim]Zone conflict detection: Prevents moves that would violate CrateDB's zone awareness[/dim]") if prioritize_space: @@ -328,21 +363,21 @@ def recommend(ctx, table: Optional[str], min_size: float, max_size: float, max_recommendations=max_moves, prioritize_space=prioritize_space, source_node=node, - max_disk_usage_percent=max_disk_usage + max_disk_usage_percent=max_disk_usage, ) if not recommendations: if node: console.print(f"[yellow]No safe recommendations found for node '{node}'[/yellow]") - console.print(f"[dim]This could be due to:[/dim]") - console.print(f"[dim] β€’ Zone conflicts preventing safe moves[/dim]") + console.print("[dim]This could be due to:[/dim]") + console.print("[dim] β€’ Zone conflicts preventing safe moves[/dim]") console.print(f"[dim] β€’ Target nodes exceeding {max_disk_usage}% disk usage threshold[/dim]") console.print(f"[dim] β€’ Insufficient free space on target nodes (need {min_free_space}GB)[/dim]") console.print(f"[dim] β€’ No shards in size range {min_size}-{max_size}GB[/dim]") - console.print(f"[dim]Suggestions:[/dim]") - console.print(f"[dim] β€’ Try: --max-disk-usage 95 (allow higher disk usage)[/dim]") - console.print(f"[dim] β€’ Try: --min-free-space 50 (reduce space requirements)[/dim]") - console.print(f"[dim] β€’ Try: different size ranges or remove --node filter[/dim]") + console.print("[dim]Suggestions:[/dim]") + console.print("[dim] β€’ Try: --max-disk-usage 95 (allow higher disk usage)[/dim]") + console.print("[dim] β€’ Try: --min-free-space 50 (reduce space requirements)[/dim]") + console.print("[dim] β€’ Try: different size ranges or remove --node filter[/dim]") else: console.print("[green]No rebalancing recommendations needed. 
Cluster appears well balanced![/green]") return @@ -377,7 +412,7 @@ def recommend(ctx, table: Optional[str], min_size: float, max_size: float, format_size(target_free_space), zone_change, format_size(rec.size_gb), - rec.reason + rec.reason, ] if validate: @@ -420,14 +455,16 @@ def recommend(ctx, table: Optional[str], min_size: float, max_size: float, console.print(f"[dim] Target SQL: {rec.to_sql()}[/dim]") console.print() - console.print(f"[bold]Dry Run Summary:[/bold]") + console.print("[bold]Dry Run Summary:[/bold]") console.print(f" β€’ Safe moves that would execute: [green]{safe_moves}[/green]") console.print(f" β€’ Zone conflicts prevented: [yellow]{zone_conflicts}[/yellow]") console.print(f" β€’ Space-related issues: [yellow]{space_issues}[/yellow]") if safe_moves > 0: - console.print(f"\n[green]βœ“ Ready to execute {safe_moves} safe moves. Use --execute to generate SQL commands.[/green]") + console.print( + f"\n[green]βœ“ Ready to execute {safe_moves} safe moves. Use --execute to generate SQL commands.[/green]" + ) else: - console.print(f"\n[yellow]⚠ No safe moves identified. Review cluster balance or adjust parameters.[/yellow]") + console.print("\n[yellow]⚠ No safe moves identified. 
Review cluster balance or adjust parameters.[/yellow]") else: console.print(Panel.fit("[bold green]Generated SQL Commands[/bold green]")) console.print("[dim]# Copy and paste these commands to execute the moves[/dim]") @@ -445,7 +482,7 @@ def recommend(ctx, table: Optional[str], min_size: float, max_size: float, if "Zone conflict" in safety_msg: zone_conflicts += 1 console.print(f"-- Move {i}: SKIPPED - {safety_msg}") - console.print(f"-- Tip: Try moving to a different zone or check existing shard distribution") + console.print("-- Tip: Try moving to a different zone or check existing shard distribution") else: console.print(f"-- Move {i}: SKIPPED - {safety_msg}") continue @@ -462,12 +499,14 @@ def recommend(ctx, table: Optional[str], min_size: float, max_size: float, if validate and safe_moves < len(recommendations): if zone_conflicts > 0: console.print(f"[yellow]Warning: {zone_conflicts} moves skipped due to zone conflicts[/yellow]") - console.print(f"[yellow]Tip: Use 'find-candidates' to see current shard distribution across zones[/yellow]") - console.print(f"[yellow]Warning: Only {safe_moves} of {len(recommendations)} moves passed safety validation[/yellow]") + console.print("[yellow]Tip: Use 'find-candidates' to see current shard distribution across zones[/yellow]") + console.print( + f"[yellow]Warning: Only {safe_moves} of {len(recommendations)} moves passed safety validation[/yellow]" + ) @main.command() -@click.option('--connection-string', help='Override connection string from .env') +@click.option("--connection-string", help="Override connection string from .env") @click.pass_context def test_connection(ctx, connection_string: Optional[str]): """Test connection to CrateDB cluster""" @@ -495,12 +534,12 @@ def test_connection(ctx, connection_string: Optional[str]): @main.command() -@click.option('--table', '-t', help='Check balance for specific table only') -@click.option('--tolerance', default=10.0, help='Zone balance tolerance percentage (default: 10)') 
+@click.option("--table", "-t", help="Check balance for specific table only") +@click.option("--tolerance", default=10.0, help="Zone balance tolerance percentage (default: 10)") @click.pass_context def check_balance(ctx, table: Optional[str], tolerance: float): """Check zone balance for shards""" - client = ctx.obj['client'] + client = ctx.obj["client"] analyzer = ShardAnalyzer(client) console.print(Panel.fit("[bold blue]Zone Balance Check[/bold blue]")) @@ -514,13 +553,10 @@ def check_balance(ctx, table: Optional[str], tolerance: float): return # Calculate totals and targets - total_shards = sum(stats['TOTAL'] for stats in zone_stats.values()) + total_shards = sum(stats["TOTAL"] for stats in zone_stats.values()) zones = list(zone_stats.keys()) target_per_zone = total_shards // len(zones) if zones else 0 - tolerance_range = ( - target_per_zone * (1 - tolerance / 100), - target_per_zone * (1 + tolerance / 100) - ) + tolerance_range = (target_per_zone * (1 - tolerance / 100), target_per_zone * (1 + tolerance / 100)) balance_table = Table(title=f"Zone Balance Analysis (Target: {target_per_zone} Β±{tolerance}%)", box=box.ROUNDED) balance_table.add_column("Zone", style="cyan") @@ -530,7 +566,7 @@ def check_balance(ctx, table: Optional[str], tolerance: float): balance_table.add_column("Status", style="bold") for zone, stats in zone_stats.items(): - total = stats['TOTAL'] + total = stats["TOTAL"] if tolerance_range[0] <= total <= tolerance_range[1]: status = "[green]βœ“ Balanced[/green]" @@ -539,24 +575,18 @@ def check_balance(ctx, table: Optional[str], tolerance: float): else: status = f"[red]⚠ Over ({total - target_per_zone:+})[/red]" - balance_table.add_row( - zone, - str(stats['PRIMARY']), - str(stats['REPLICA']), - str(total), - status - ) + balance_table.add_row(zone, str(stats["PRIMARY"]), str(stats["REPLICA"]), str(total), status) console.print(balance_table) @main.command() -@click.option('--table', '-t', help='Analyze zones for specific table only') 
-@click.option('--show-shards/--no-show-shards', default=False, help='Show individual shard details (default: False)') +@click.option("--table", "-t", help="Analyze zones for specific table only") +@click.option("--show-shards/--no-show-shards", default=False, help="Show individual shard details (default: False)") @click.pass_context def zone_analysis(ctx, table: Optional[str], show_shards: bool): """Detailed analysis of zone distribution and potential conflicts""" - client = ctx.obj['client'] + client = ctx.obj["client"] console.print(Panel.fit("[bold blue]Detailed Zone Analysis[/bold blue]")) console.print("[dim]Comprehensive zone distribution analysis for CrateDB cluster[/dim]") @@ -626,23 +656,22 @@ def zone_analysis(ctx, table: Optional[str], show_shards: bool): replica_zones_str = ", ".join(sorted(replica_zones)) if replica_zones else "None" analysis_table.add_row( - str(shard_id), - primary_zone, - replica_zones_str, - str(total_copies), - " ".join(status_parts) + str(shard_id), primary_zone, replica_zones_str, str(total_copies), " ".join(status_parts) ) # Show individual shard details if requested if show_shards: for shard_copy in shard_copies: - health_indicator = "βœ“" if shard_copy.routing_state == 'STARTED' else "⚠" - console.print(f" {health_indicator} {shard_copy.shard_type} on {shard_copy.node_name} ({shard_copy.zone}) - {shard_copy.routing_state}") + health_indicator = "βœ“" if shard_copy.routing_state == "STARTED" else "⚠" + console.print( + f" {health_indicator} {shard_copy.shard_type} " + f"on {shard_copy.node_name} ({shard_copy.zone}) - {shard_copy.routing_state}" + ) console.print(analysis_table) # Summary - console.print(f"\n[bold]Zone Analysis Summary:[/bold]") + console.print("\n[bold]Zone Analysis Summary:[/bold]") console.print(f" β€’ Tables analyzed: [cyan]{len(tables)}[/cyan]") console.print(f" β€’ Zone conflicts detected: [red]{zone_conflicts}[/red]") console.print(f" β€’ Under-replicated shards: [yellow]{under_replicated}[/yellow]") @@ 
-660,131 +689,12 @@ def zone_analysis(ctx, table: Optional[str], show_shards: bool): console.print("\n[green]βœ“ No critical zone distribution issues detected![/green]") -# @main.command() -# @click.argument('node_name') -# @click.option('--min-free-space', default=100.0, help='Minimum free space required on target nodes in GB (default: 100)') -# @click.option('--dry-run/--execute', default=True, help='Show decommission plan without generating SQL commands (default: True)') -# @click.pass_context -# def decommission(ctx, node_name: str, min_free_space: float, dry_run: bool): -# """Plan decommissioning of a node by analyzing required shard moves -# -# NODE_NAME: Name of the node to decommission -# """ -# client = ctx.obj['client'] -# analyzer = ShardAnalyzer(client) -# -# mode_text = "PLANNING MODE" if dry_run else "EXECUTION MODE" -# console.print(Panel.fit(f"[bold blue]Node Decommission Analysis[/bold blue] - [bold {'green' if dry_run else 'red'}]{mode_text}[/bold {'green' if dry_run else 'red'}]")) -# console.print(f"[dim]Analyzing decommission plan for node: {node_name}[/dim]") -# console.print() -# -# # Generate decommission plan -# plan = analyzer.plan_node_decommission(node_name, min_free_space) -# -# if 'error' in plan: -# console.print(f"[red]Error: {plan['error']}[/red]") -# return -# -# # Display plan summary -# summary_table = Table(title=f"Decommission Plan for {node_name}", box=box.ROUNDED) -# summary_table.add_column("Metric", style="cyan") -# summary_table.add_column("Value", style="magenta") -# -# summary_table.add_row("Node", plan['node']) -# summary_table.add_row("Zone", plan['zone']) -# summary_table.add_row("Feasible", "[green]βœ“ Yes[/green]" if plan['feasible'] else "[red]βœ— No[/red]") -# summary_table.add_row("Shards to Move", str(plan['shards_to_move'])) -# summary_table.add_row("Moveable Shards", str(plan['moveable_shards'])) -# summary_table.add_row("Total Data Size", format_size(plan['total_size_gb'])) -# summary_table.add_row("Estimated 
Time", f"{plan['estimated_time_hours']:.1f} hours") -# -# console.print(summary_table) -# console.print() -# -# # Show warnings if any -# if plan['warnings']: -# console.print("[bold yellow]⚠ Warnings:[/bold yellow]") -# for warning in plan['warnings']: -# console.print(f" β€’ [yellow]{warning}[/yellow]") -# console.print() -# -# # Show infeasible moves if any -# if plan['infeasible_moves']: -# console.print("[bold red]βœ— Cannot Move:[/bold red]") -# infeasible_table = Table(box=box.ROUNDED) -# infeasible_table.add_column("Shard", style="cyan") -# infeasible_table.add_column("Size", style="magenta") -# infeasible_table.add_column("Reason", style="red") -# -# for move in plan['infeasible_moves']: -# infeasible_table.add_row( -# move['shard'], -# format_size(move['size_gb']), -# move['reason'] -# ) -# console.print(infeasible_table) -# console.print() -# -# # Show move recommendations -# if plan['recommendations']: -# move_table = Table(title="Required Shard Moves", box=box.ROUNDED) -# move_table.add_column("Table", style="cyan") -# move_table.add_column("Shard", justify="right", style="magenta") -# move_table.add_column("Type", style="blue") -# move_table.add_column("Size", style="green") -# move_table.add_column("From Zone", style="yellow") -# move_table.add_column("To Node", style="cyan") -# move_table.add_column("To Zone", style="yellow") -# -# for rec in plan['recommendations']: -# move_table.add_row( -# f"{rec.schema_name}.{rec.table_name}", -# str(rec.shard_id), -# rec.shard_type, -# format_size(rec.size_gb), -# rec.from_zone, -# rec.to_node, -# rec.to_zone -# ) -# -# console.print(move_table) -# console.print() -# -# # Generate SQL commands if not in dry-run mode -# if not dry_run and plan['feasible']: -# console.print(Panel.fit("[bold green]Decommission SQL Commands[/bold green]")) -# console.print("[dim]# Execute these commands in order to prepare for node decommission[/dim]") -# console.print("[dim]# ALWAYS test in a non-production environment 
first![/dim]") -# console.print("[dim]# Monitor shard health after each move before proceeding[/dim]") -# console.print() -# -# for i, rec in enumerate(plan['recommendations'], 1): -# console.print(f"-- Move {i}: {rec.reason}") -# console.print(f"{rec.to_sql()}") -# console.print() -# -# console.print(f"-- After all moves complete, the node {node_name} can be safely removed") -# console.print(f"-- Total moves required: {len(plan['recommendations'])}") -# elif dry_run: -# console.print("[green]βœ“ Decommission plan ready. Use --execute to generate SQL commands.[/green]") -# -# # Final status -# if not plan['feasible']: -# console.print(f"[red]⚠ Node {node_name} cannot be safely decommissioned at this time.[/red]") -# console.print("[dim]Address the issues above before attempting decommission.[/dim]") -# elif plan['shards_to_move'] == 0: -# console.print(f"[green]βœ“ Node {node_name} is ready for immediate decommission (no shards to move).[/green]") -# else: -# console.print(f"[green]βœ“ Node {node_name} can be safely decommissioned after moving {len(plan['recommendations'])} shards.[/green]") - - @main.command() -@click.argument('schema_table') -@click.argument('shard_id', type=int) -@click.argument('from_node') -@click.argument('to_node') -@click.option('--max-disk-usage', default=90.0, help='Maximum disk usage percentage for target node (default: 90)') - +@click.argument("schema_table") +@click.argument("shard_id", type=int) +@click.argument("from_node") +@click.argument("to_node") +@click.option("--max-disk-usage", default=90.0, help="Maximum disk usage percentage for target node (default: 90)") @click.pass_context def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node: str, max_disk_usage: float): """Validate a specific shard move before execution @@ -796,17 +706,17 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node Example: xmover validate-move CUROV.maddoxxFormfactor 4 data-hot-1 data-hot-3 """ - 
client = ctx.obj['client'] + client = ctx.obj["client"] analyzer = ShardAnalyzer(client) # Parse schema and table - if '.' not in schema_table: + if "." not in schema_table: console.print("[red]Error: Schema and table must be in format 'schema.table'[/red]") return - schema_name, table_name = schema_table.split('.', 1) + schema_name, table_name = schema_table.split(".", 1) - console.print(Panel.fit(f"[bold blue]Validating Shard Move[/bold blue]")) + console.print(Panel.fit("[bold blue]Validating Shard Move[/bold blue]")) console.print(f"[dim]Move: {schema_name}.{table_name}[{shard_id}] from {from_node} to {to_node}[/dim]") console.print() @@ -830,16 +740,18 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node # Find the specific shard target_shard = None for shard in analyzer.shards: - if (shard.schema_name == schema_name and - shard.table_name == table_name and - shard.shard_id == shard_id and - shard.node_name == from_node): + if ( + shard.schema_name == schema_name + and shard.table_name == table_name + and shard.shard_id == shard_id + and shard.node_name == from_node + ): target_shard = shard break if not target_shard: console.print(f"[red]βœ— Shard {shard_id} not found on node {from_node}[/red]") - console.print(f"[dim]Use 'xmover find-candidates' to see available shards[/dim]") + console.print("[dim]Use 'xmover find-candidates' to see available shards[/dim]") return # Create a move recommendation for validation @@ -853,7 +765,7 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node to_zone=to_node_info.zone, shard_type=target_shard.shard_type, size_gb=target_shard.size_gb, - reason="Manual validation" + reason="Manual validation", ) # Display shard details @@ -890,7 +802,9 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node console.print(f"{recommendation.to_sql()}") console.print() console.print("[dim]# Monitor shard health after execution[/dim]") - 
console.print("[dim]# Check with: SELECT * FROM sys.shards WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]") + console.print( + "[dim]# Check with: SELECT * FROM sys.shards WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]" + ) else: console.print("[red]βœ— VALIDATION FAILED - Move not safe[/red]") console.print(f"[red]βœ— {safety_msg}[/red]") @@ -921,7 +835,7 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node @main.command() -@click.argument('error_message', required=False) +@click.argument("error_message", required=False) @click.pass_context def explain_error(ctx, error_message: Optional[str]): """Explain CrateDB allocation error messages and provide solutions @@ -951,7 +865,7 @@ def explain_error(ctx, error_message: Optional[str]): console.print("[yellow]No error message provided[/yellow]") return - console.print(f"[dim]Analyzing error message...[/dim]") + console.print("[dim]Analyzing error message...[/dim]") console.print() # Common CrateDB allocation error patterns and solutions @@ -963,9 +877,9 @@ def explain_error(ctx, error_message: Optional[str]): "solutions": [ "Choose a different target node that doesn't have this shard", "Use 'xmover zone-analysis --show-shards' to see current distribution", - "Verify the shard ID and table name are correct" + "Verify the shard ID and table name are correct", ], - "prevention": "Always check current shard locations before moving" + "prevention": "Always check current shard locations before moving", }, { "pattern": "there are too many copies of the shard allocated to nodes with attribute", @@ -974,9 +888,9 @@ def explain_error(ctx, error_message: Optional[str]): "solutions": [ "Move the shard to a different availability zone", "Check zone balance with 'xmover check-balance'", - "Ensure target zone doesn't already have copies of this shard" + "Ensure target zone doesn't already have copies of this shard", ], - "prevention": "Use 'xmover recommend' which respects 
zone constraints" + "prevention": "Use 'xmover recommend' which respects zone constraints", }, { "pattern": "not enough disk space", @@ -985,9 +899,9 @@ def explain_error(ctx, error_message: Optional[str]): "solutions": [ "Free up space on the target node", "Choose a node with more available capacity", - "Check available space with 'xmover analyze'" + "Check available space with 'xmover analyze'", ], - "prevention": "Use '--min-free-space' parameter in recommendations" + "prevention": "Use '--min-free-space' parameter in recommendations", }, { "pattern": "shard recovery limit", @@ -996,21 +910,22 @@ def explain_error(ctx, error_message: Optional[str]): "solutions": [ "Wait for current recoveries to complete", "Check recovery status in CrateDB admin UI", - "Reduce concurrent recoveries in cluster settings" + "Reduce concurrent recoveries in cluster settings", ], - "prevention": "Move shards gradually, monitor recovery progress" + "prevention": "Move shards gradually, monitor recovery progress", }, { "pattern": "allocation is disabled", "title": "Allocation Disabled", "explanation": "Shard allocation is temporarily disabled in the cluster.", "solutions": [ - "Re-enable allocation: PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":\"all\"}}", + "Re-enable allocation: PUT /_cluster/settings " + '{"persistent":{"cluster.routing.allocation.enable":"all"}}', "Check if allocation was disabled for maintenance", - "Verify cluster health before re-enabling" + "Verify cluster health before re-enabling", ], - "prevention": "Check allocation status before performing moves" - } + "prevention": "Check allocation status before performing moves", + }, ] # Find matching patterns @@ -1031,7 +946,7 @@ def explain_error(ctx, error_message: Optional[str]): console.print() console.print("[green]πŸ’‘ Solutions:[/green]") - for j, solution in enumerate(match['solutions'], 1): + for j, solution in enumerate(match["solutions"], 1): console.print(f" {j}. 
{solution}") console.print() @@ -1041,25 +956,33 @@ def explain_error(ctx, error_message: Optional[str]): console.print() console.print("[bold]General Troubleshooting Steps:[/bold]") console.print("1. Check current shard distribution: [cyan]xmover analyze[/cyan]") - console.print("2. Validate the specific move: [cyan]xmover validate-move schema.table shard_id from_node to_node[/cyan]") + console.print( + "2. Validate the specific move: [cyan]xmover validate-move schema.table shard_id from_node to_node[/cyan]" + ) console.print("3. Check zone conflicts: [cyan]xmover zone-analysis --show-shards[/cyan]") console.print("4. Verify node capacity: [cyan]xmover analyze[/cyan]") console.print("5. Review CrateDB documentation on shard allocation") console.print() console.print("[dim]πŸ’‘ Tip: Use 'xmover validate-move' to check moves before execution[/dim]") - console.print("[dim]πŸ“š For more help: https://crate.io/docs/crate/reference/en/latest/admin/system-information.html[/dim]") + console.print( + "[dim]πŸ“š For more help: https://crate.io/docs/crate/reference/en/latest/admin/system-information.html[/dim]" + ) @main.command() -@click.option('--table', '-t', help='Monitor recovery for specific table only') -@click.option('--node', '-n', help='Monitor recovery on specific node only') -@click.option('--watch', '-w', is_flag=True, help='Continuously monitor (refresh every 10s)') -@click.option('--refresh-interval', default=10, help='Refresh interval for watch mode (seconds)') -@click.option('--recovery-type', type=click.Choice(['PEER', 'DISK', 'all']), default='all', help='Filter by recovery type') -@click.option('--include-transitioning', is_flag=True, help='Include completed recoveries still in transitioning state') +@click.option("--table", "-t", help="Monitor recovery for specific table only") +@click.option("--node", "-n", help="Monitor recovery on specific node only") +@click.option("--watch", "-w", is_flag=True, help="Continuously monitor (refresh every 10s)") 
+@click.option("--refresh-interval", default=10, help="Refresh interval for watch mode (seconds)") +@click.option( + "--recovery-type", type=click.Choice(["PEER", "DISK", "all"]), default="all", help="Filter by recovery type" +) +@click.option("--include-transitioning", is_flag=True, help="Include completed recoveries still in transitioning state") @click.pass_context -def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: int, recovery_type: str, include_transitioning: bool): +def monitor_recovery( + ctx, table: str, node: str, watch: bool, refresh_interval: int, recovery_type: str, include_transitioning: bool +): """Monitor active shard recovery operations on the cluster This command monitors ongoing shard recoveries by querying sys.allocations @@ -1076,11 +999,10 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: xmover monitor-recovery --recovery-type PEER # Only PEER recoveries """ try: - client = ctx.obj['client'] + client = ctx.obj["client"] recovery_monitor = RecoveryMonitor(client) if watch: - console.print(f"πŸ”„ Monitoring shard recoveries (refreshing every {refresh_interval}s)") console.print("Press Ctrl+C to stop") console.print() @@ -1101,11 +1023,12 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: table_name=table, node_name=node, recovery_type_filter=recovery_type, - include_transitioning=include_transitioning + include_transitioning=include_transitioning, ) # Display current time from datetime import datetime + current_time = datetime.now().strftime("%H:%M:%S") # Check for any changes @@ -1114,7 +1037,9 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: completed_count = 0 for recovery in recoveries: - recovery_key = f"{recovery.schema_name}.{recovery.table_name}.{recovery.shard_id}.{recovery.node_name}" + recovery_key = ( + f"{recovery.schema_name}.{recovery.table_name}.{recovery.shard_id}.{recovery.node_name}" + ) # Create 
complete table name if recovery.schema_name == "doc": @@ -1131,8 +1056,8 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: # Check for changes since last update if recovery_key in previous_recoveries: prev = previous_recoveries[recovery_key] - if prev['progress'] != recovery.overall_progress: - diff = recovery.overall_progress - prev['progress'] + if prev["progress"] != recovery.overall_progress: + diff = recovery.overall_progress - prev["progress"] # Create node route display node_route = "" if recovery.recovery_type == "PEER" and recovery.source_node_name: @@ -1142,12 +1067,20 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: # Add translog info translog_info = format_translog_info(recovery) - + if diff > 0: - changes.append(f"[green]πŸ“ˆ[/green] {table_display} S{recovery.shard_id} {recovery.overall_progress:.1f}% (+{diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}") + changes.append( + f"[green]πŸ“ˆ[/green] {table_display} S{recovery.shard_id} " + f"{recovery.overall_progress:.1f}% (+{diff:.1f}%) " + f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" + ) else: - changes.append(f"[yellow]πŸ“‰[/yellow] {table_display} S{recovery.shard_id} {recovery.overall_progress:.1f}% ({diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}") - elif prev['stage'] != recovery.stage: + changes.append( + f"[yellow]πŸ“‰[/yellow] {table_display} S{recovery.shard_id} " + f"{recovery.overall_progress:.1f}% ({diff:.1f}%) " + f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" + ) + elif prev["stage"] != recovery.stage: # Create node route display node_route = "" if recovery.recovery_type == "PEER" and recovery.source_node_name: @@ -1157,11 +1090,19 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: # Add translog info translog_info = format_translog_info(recovery) - - changes.append(f"[blue]πŸ”„[/blue] {table_display} S{recovery.shard_id} 
{prev['stage']}β†’{recovery.stage} {recovery.size_gb:.1f}GB{translog_info}{node_route}") + + changes.append( + f"[blue]πŸ”„[/blue] {table_display} S{recovery.shard_id} " + f"{prev['stage']}β†’{recovery.stage} " + f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" + ) else: # New recovery - show based on include_transitioning flag or first run - if first_run or include_transitioning or (recovery.overall_progress < 100.0 or recovery.stage != "DONE"): + if ( + first_run + or include_transitioning + or (recovery.overall_progress < 100.0 or recovery.stage != "DONE") + ): # Create node route display node_route = "" if recovery.recovery_type == "PEER" and recovery.source_node_name: @@ -1172,13 +1113,17 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: status_icon = "[cyan]πŸ†•[/cyan]" if not first_run else "[blue]πŸ“‹[/blue]" # Add translog info translog_info = format_translog_info(recovery) - - changes.append(f"{status_icon} {table_display} S{recovery.shard_id} {recovery.stage} {recovery.overall_progress:.1f}% {recovery.size_gb:.1f}GB{translog_info}{node_route}") + + changes.append( + f"{status_icon} {table_display} S{recovery.shard_id} " + f"{recovery.stage} {recovery.overall_progress:.1f}% " + f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" + ) # Store current state for next comparison previous_recoveries[recovery_key] = { - 'progress': recovery.overall_progress, - 'stage': recovery.stage + "progress": recovery.overall_progress, + "stage": recovery.stage, } # Always show a status line @@ -1205,7 +1150,7 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: elif active_count > 0: console.print(f"{current_time} | {status} (no changes)") - previous_timestamp = current_time + previous_timestamp = current_time # noqa: F841 first_run = False time.sleep(refresh_interval) @@ -1217,7 +1162,7 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: table_name=table, node_name=node, 
recovery_type_filter=recovery_type, - include_transitioning=include_transitioning + include_transitioning=include_transitioning, ) if final_recoveries: @@ -1233,10 +1178,13 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: console.print(f" Total size: {summary['total_size_gb']:.1f} GB") console.print(f" Average progress: {summary['avg_progress']:.1f}%") - if summary['by_type']: - console.print(f" By recovery type:") - for rec_type, stats in summary['by_type'].items(): - console.print(f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress") + if summary["by_type"]: + console.print(" By recovery type:") + for rec_type, stats in summary["by_type"].items(): + console.print( + f" {rec_type}: {stats['count']} recoveries, " + f"{stats['avg_progress']:.1f}% avg progress" + ) else: console.print("\n[green]βœ… No active recoveries at exit[/green]") @@ -1248,7 +1196,7 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: table_name=table, node_name=node, recovery_type_filter=recovery_type, - include_transitioning=include_transitioning + include_transitioning=include_transitioning, ) display_output = recovery_monitor.format_recovery_display(recoveries) @@ -1259,62 +1207,65 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: console.print("\n[green]βœ… No recoveries found (active or transitioning)[/green]") else: console.print("\n[green]βœ… No active recoveries found[/green]") - console.print("[dim]πŸ’‘ Use --include-transitioning to see completed recoveries still transitioning[/dim]") + console.print( + "[dim]πŸ’‘ Use --include-transitioning to see completed recoveries still transitioning[/dim]" + ) else: # Show summary summary = recovery_monitor.get_recovery_summary(recoveries) - console.print(f"\nπŸ“Š [bold]Recovery Summary:[/bold]") + console.print("\nπŸ“Š [bold]Recovery Summary:[/bold]") console.print(f" Total recoveries: 
{summary['total_recoveries']}") console.print(f" Total size: {summary['total_size_gb']:.1f} GB") console.print(f" Average progress: {summary['avg_progress']:.1f}%") # Show breakdown by type - if summary['by_type']: - console.print(f"\n By recovery type:") - for rec_type, stats in summary['by_type'].items(): - console.print(f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress") + if summary["by_type"]: + console.print("\n By recovery type:") + for rec_type, stats in summary["by_type"].items(): + console.print( + f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress" + ) - console.print(f"\n[dim]πŸ’‘ Use --watch flag for continuous monitoring[/dim]") + console.print("\n[dim]πŸ’‘ Use --watch flag for continuous monitoring[/dim]") except Exception as e: console.print(f"[red]❌ Error monitoring recoveries: {e}[/red]") - if ctx.obj.get('debug'): + if ctx.obj.get("debug"): raise def _wait_for_recovery_capacity(client, max_concurrent_recoveries: int = 5): """Wait until active recovery count is below threshold""" - from xmover.analyzer import RecoveryMonitor from time import sleep - + + from .analyzer import RecoveryMonitor + recovery_monitor = RecoveryMonitor(client) wait_time = 0 - + while True: # Check active recoveries (including transitioning) recoveries = recovery_monitor.get_cluster_recovery_status(include_transitioning=True) active_count = len([r for r in recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]) - + status = f"{active_count}/{max_concurrent_recoveries}" if active_count < max_concurrent_recoveries: if wait_time > 0: - console.print(f" [green]βœ“ Recovery capacity available ({active_count}/{max_concurrent_recoveries} active)[/green]") + console.print(f" [green]βœ“ Recovery capacity available ({status} active)[/green]") break - else: - if wait_time == 0: - console.print(f" [yellow]⏳ Waiting for recovery capacity... 
({active_count}/{max_concurrent_recoveries} active)[/yellow]") - elif wait_time % 30 == 0: # Update every 30 seconds - console.print(f" [yellow]⏳ Still waiting... ({active_count}/{max_concurrent_recoveries} active)[/yellow]") - - sleep(10) # Check every 10 seconds - wait_time += 10 + if wait_time == 0: + console.print(f" [yellow]⏳ Waiting for recovery capacity... ({status} active)[/yellow]") + elif wait_time % 30 == 0: # Update every 30 seconds + console.print(f" [yellow]⏳ Still waiting... ({status} active)[/yellow]") + + sleep(10) # Check every 10 seconds + wait_time += 10 def _execute_recommendations_safely(client, recommendations, validate: bool): """Execute recommendations with extensive safety measures""" - from time import sleep - import sys - from xmover.analyzer import ShardAnalyzer - + + from .analyzer import ShardAnalyzer + # Filter to only safe recommendations safe_recommendations = [] if validate: @@ -1325,20 +1276,20 @@ def _execute_recommendations_safely(client, recommendations, validate: bool): safe_recommendations.append(rec) else: safe_recommendations = recommendations - + if not safe_recommendations: console.print("[yellow]⚠ No safe recommendations to execute[/yellow]") return - - console.print(f"\n[bold red]🚨 AUTO-EXECUTION MODE 🚨[/bold red]") + + console.print("\n[bold red]🚨 AUTO-EXECUTION MODE 🚨[/bold red]") console.print(f"About to execute {len(safe_recommendations)} shard moves automatically:") console.print() - + # Show what will be executed for i, rec in enumerate(safe_recommendations, 1): table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name console.print(f" {i}. 
{table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB) {rec.from_node} β†’ {rec.to_node}") - + console.print() console.print("[bold yellow]⚠ SAFETY WARNINGS:[/bold yellow]") console.print(" β€’ These commands will immediately start shard movements") @@ -1346,74 +1297,80 @@ def _execute_recommendations_safely(client, recommendations, validate: bool): console.print(" β€’ Recovery time depends on shard size and network speed") console.print(" β€’ You should monitor progress with: xmover monitor-recovery --watch") console.print() - + # Double confirmation try: response1 = input("Type 'EXECUTE' to proceed with automatic execution: ").strip() if response1 != "EXECUTE": console.print("[yellow]❌ Execution cancelled[/yellow]") return - + response2 = input(f"Confirm: Execute {len(safe_recommendations)} shard moves? (yes/no): ").strip().lower() - if response2 not in ['yes', 'y']: + if response2 not in ["yes", "y"]: console.print("[yellow]❌ Execution cancelled[/yellow]") return - + except KeyboardInterrupt: console.print("\n[yellow]❌ Execution cancelled by user[/yellow]") return - + console.print(f"\nπŸš€ [bold green]Executing {len(safe_recommendations)} shard moves...[/bold green]") console.print() - + successful_moves = 0 failed_moves = 0 - + for i, rec in enumerate(safe_recommendations, 1): table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name sql_command = rec.to_sql() - - console.print(f"[{i}/{len(safe_recommendations)}] Executing: {table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB)") + + console.print( + f"[{i}/{len(safe_recommendations)}] Executing: {table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB)" + ) console.print(f" {rec.from_node} β†’ {rec.to_node}") - + try: # Execute the SQL command result = client.execute_query(sql_command) - - if result.get('rowcount', 0) >= 0: # Success indicator for ALTER statements - console.print(f" [green]βœ… SUCCESS[/green] - Move initiated") + + if result.get("rowcount", 0) >= 0: # 
Success indicator for ALTER statements + console.print(" [green]βœ… SUCCESS[/green] - Move initiated") successful_moves += 1 - + # Smart delay: check active recoveries before next move if i < len(safe_recommendations): _wait_for_recovery_capacity(client, max_concurrent_recoveries=5) else: console.print(f" [red]❌ FAILED[/red] - Unexpected result: {result}") failed_moves += 1 - + except Exception as e: console.print(f" [red]❌ FAILED[/red] - Error: {e}") failed_moves += 1 - + # Ask whether to continue after a failure if i < len(safe_recommendations): try: - continue_response = input(f" Continue with remaining {len(safe_recommendations) - i} moves? (yes/no): ").strip().lower() - if continue_response not in ['yes', 'y']: + continue_response = ( + input(f" Continue with remaining {len(safe_recommendations) - i} moves? (yes/no): ") + .strip() + .lower() + ) + if continue_response not in ["yes", "y"]: console.print("[yellow]⏹ Execution stopped by user[/yellow]") break except KeyboardInterrupt: console.print("\n[yellow]⏹ Execution stopped by user[/yellow]") break - + console.print() - + # Final summary - console.print(f"πŸ“Š [bold]Execution Summary:[/bold]") + console.print("πŸ“Š [bold]Execution Summary:[/bold]") console.print(f" Successful moves: [green]{successful_moves}[/green]") console.print(f" Failed moves: [red]{failed_moves}[/red]") console.print(f" Total attempted: {successful_moves + failed_moves}") - + if successful_moves > 0: console.print() console.print("[green]βœ… Shard moves initiated successfully![/green]") @@ -1421,11 +1378,11 @@ def _execute_recommendations_safely(client, recommendations, validate: bool): console.print("[dim] xmover monitor-recovery --watch[/dim]") console.print("[dim]πŸ’‘ Check cluster status with:[/dim]") console.print("[dim] xmover analyze[/dim]") - + if failed_moves > 0: console.print() console.print(f"[yellow]⚠ {failed_moves} moves failed - check cluster status and retry if needed[/yellow]") -if __name__ == '__main__': +if __name__ 
== "__main__": main() diff --git a/cratedb_toolkit/admin/xmover/database.py b/cratedb_toolkit/admin/xmover/database.py index ec3a0098..a6e2d35f 100644 --- a/cratedb_toolkit/admin/xmover/database.py +++ b/cratedb_toolkit/admin/xmover/database.py @@ -2,17 +2,21 @@ Database connection and query functions for CrateDB """ +import logging import os -import json -import requests -from typing import Dict, List, Optional, Any from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import requests from dotenv import load_dotenv +logger = logging.getLogger(__name__) + @dataclass class NodeInfo: """Information about a CrateDB node""" + id: str name: str zone: str @@ -21,15 +25,15 @@ class NodeInfo: fs_total: int fs_used: int fs_available: int - + @property def heap_usage_percent(self) -> float: return (self.heap_used / self.heap_max) * 100 if self.heap_max > 0 else 0 - + @property def disk_usage_percent(self) -> float: return (self.fs_used / self.fs_total) * 100 if self.fs_total > 0 else 0 - + @property def available_space_gb(self) -> float: return self.fs_available / (1024**3) @@ -38,6 +42,7 @@ def available_space_gb(self) -> float: @dataclass class ShardInfo: """Information about a shard""" + table_name: str schema_name: str shard_id: int @@ -50,7 +55,7 @@ class ShardInfo: num_docs: int state: str routing_state: str - + @property def shard_type(self) -> str: return "PRIMARY" if self.is_primary else "REPLICA" @@ -59,6 +64,7 @@ def shard_type(self) -> str: @dataclass class RecoveryInfo: """Information about an active shard recovery""" + schema_name: str table_name: str shard_id: int @@ -75,31 +81,31 @@ class RecoveryInfo: size_bytes: int source_node_name: Optional[str] = None # Source node for PEER recoveries translog_size_bytes: int = 0 # Translog size in bytes - + @property def overall_progress(self) -> float: """Calculate overall progress percentage""" return max(self.files_percent, self.bytes_percent) - + @property def size_gb(self) -> float: 
"""Size in GB""" return self.size_bytes / (1024**3) - + @property def shard_type(self) -> str: return "PRIMARY" if self.is_primary else "REPLICA" - + @property def total_time_seconds(self) -> float: """Total time in seconds""" return self.total_time_ms / 1000.0 - + @property def translog_size_gb(self) -> float: """Translog size in GB""" return self.translog_size_bytes / (1024**3) - + @property def translog_percentage(self) -> float: """Translog size as percentage of shard size""" @@ -108,52 +114,46 @@ def translog_percentage(self) -> float: class CrateDBClient: """Client for connecting to CrateDB and executing queries""" - + def __init__(self, connection_string: Optional[str] = None): load_dotenv() - - self.connection_string = connection_string or os.getenv('CRATE_CONNECTION_STRING') + + self.connection_string = connection_string or os.getenv("CRATE_CONNECTION_STRING") if not self.connection_string: raise ValueError("CRATE_CONNECTION_STRING not found in environment or provided") - - self.username = os.getenv('CRATE_USERNAME') - self.password = os.getenv('CRATE_PASSWORD') - self.ssl_verify = os.getenv('CRATE_SSL_VERIFY', 'true').lower() == 'true' - + + self.username = os.getenv("CRATE_USERNAME") + self.password = os.getenv("CRATE_PASSWORD") + self.ssl_verify = os.getenv("CRATE_SSL_VERIFY", "true").lower() == "true" + # Ensure connection string ends with _sql endpoint - if not self.connection_string.endswith('/_sql'): - self.connection_string = self.connection_string.rstrip('/') + '/_sql' - + if not self.connection_string.endswith("/_sql"): + self.connection_string = self.connection_string.rstrip("/") + "/_sql" + def execute_query(self, query: str, parameters: Optional[List] = None) -> Dict[str, Any]: """Execute a SQL query against CrateDB""" - payload = { - 'stmt': query - } - + payload = {"stmt": query} + if parameters: - payload['args'] = parameters - + payload["args"] = parameters + auth = None if self.username and self.password: auth = (self.username, 
self.password) - + try: response = requests.post( - self.connection_string, - json=payload, - auth=auth, - verify=self.ssl_verify, - timeout=30 + self.connection_string, json=payload, auth=auth, verify=self.ssl_verify, timeout=30 ) response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: - raise Exception(f"Failed to execute query: {e}") - + raise Exception(f"Failed to execute query: {e}") from e + def get_nodes_info(self) -> List[NodeInfo]: """Get information about all nodes in the cluster""" query = """ - SELECT + SELECT id, name, attributes['zone'] as zone, @@ -166,30 +166,35 @@ def get_nodes_info(self) -> List[NodeInfo]: WHERE name IS NOT NULL ORDER BY name """ - + result = self.execute_query(query) nodes = [] - - for row in result.get('rows', []): - nodes.append(NodeInfo( - id=row[0], - name=row[1], - zone=row[2] or 'unknown', - heap_used=row[3] or 0, - heap_max=row[4] or 0, - fs_total=row[5] or 0, - fs_used=row[6] or 0, - fs_available=row[7] or 0 - )) - + + for row in result.get("rows", []): + nodes.append( + NodeInfo( + id=row[0], + name=row[1], + zone=row[2] or "unknown", + heap_used=row[3] or 0, + heap_max=row[4] or 0, + fs_total=row[5] or 0, + fs_used=row[6] or 0, + fs_available=row[7] or 0, + ) + ) + return nodes - - def get_shards_info(self, table_name: Optional[str] = None, - min_size_gb: Optional[float] = None, - max_size_gb: Optional[float] = None, - for_analysis: bool = False) -> List[ShardInfo]: + + def get_shards_info( + self, + table_name: Optional[str] = None, + min_size_gb: Optional[float] = None, + max_size_gb: Optional[float] = None, + for_analysis: bool = False, + ) -> List[ShardInfo]: """Get information about shards, optionally filtered by table and size - + Args: table_name: Filter by specific table min_size_gb: Minimum shard size in GB @@ -197,34 +202,31 @@ def get_shards_info(self, table_name: Optional[str] = None, for_analysis: If True, includes all shards regardless of state (for cluster 
analysis) If False, only includes healthy shards suitable for operations """ - + where_conditions = [] if not for_analysis: # For operations, only include healthy shards - where_conditions.extend([ - "s.routing_state = 'STARTED'", - "s.recovery['files']['percent'] = 100.0" - ]) + where_conditions.extend(["s.routing_state = 'STARTED'", "s.recovery['files']['percent'] = 100.0"]) parameters = [] - + if table_name: where_conditions.append("s.table_name = ?") parameters.append(table_name) - + if min_size_gb is not None: where_conditions.append("s.size >= ?") parameters.append(int(min_size_gb * 1024**3)) # Convert GB to bytes - + if max_size_gb is not None: where_conditions.append("s.size <= ?") parameters.append(int(max_size_gb * 1024**3)) # Convert GB to bytes - + where_clause = "" if where_conditions: where_clause = f"WHERE {' AND '.join(where_conditions)}" - + query = f""" - SELECT + SELECT s.table_name, s.schema_name, s.id as shard_id, @@ -241,32 +243,34 @@ def get_shards_info(self, table_name: Optional[str] = None, JOIN sys.nodes n ON s.node['id'] = n.id {where_clause} ORDER BY s.table_name, s.schema_name, s.id, s."primary" DESC - """ - + """ # noqa: S608 + result = self.execute_query(query, parameters) shards = [] - - for row in result.get('rows', []): - shards.append(ShardInfo( - table_name=row[0], - schema_name=row[1], - shard_id=row[2], - node_id=row[3], - node_name=row[4], - zone=row[5] or 'unknown', - is_primary=row[6], - size_bytes=row[7] or 0, - size_gb=float(row[8] or 0), - num_docs=row[9] or 0, - state=row[10], - routing_state=row[11] - )) - + + for row in result.get("rows", []): + shards.append( + ShardInfo( + table_name=row[0], + schema_name=row[1], + shard_id=row[2], + node_id=row[3], + node_name=row[4], + zone=row[5] or "unknown", + is_primary=row[6], + size_bytes=row[7] or 0, + size_gb=float(row[8] or 0), + num_docs=row[9] or 0, + state=row[10], + routing_state=row[11], + ) + ) + return shards - + def get_shard_distribution_summary(self, 
for_analysis: bool = True) -> Dict[str, Any]: """Get a summary of shard distribution across nodes and zones - + Args: for_analysis: If True, includes all shards for complete cluster analysis If False, only includes operational shards @@ -276,9 +280,9 @@ def get_shard_distribution_summary(self, for_analysis: bool = True) -> Dict[str, where_clause = """ WHERE s.routing_state = 'STARTED' AND s.recovery['files']['percent'] = 100.0""" - + query = f""" - SELECT + SELECT n.attributes['zone'] as zone, s.node['name'] as node_name, CASE WHEN s."primary" = true THEN 'PRIMARY' ELSE 'REPLICA' END as shard_type, @@ -289,98 +293,90 @@ def get_shard_distribution_summary(self, for_analysis: bool = True) -> Dict[str, JOIN sys.nodes n ON s.node['id'] = n.id{where_clause} GROUP BY n.attributes['zone'], s.node['name'], s."primary" ORDER BY zone, node_name, shard_type DESC - """ - + """ # noqa: S608 + result = self.execute_query(query) - - summary = { - 'by_zone': {}, - 'by_node': {}, - 'totals': {'primary': 0, 'replica': 0, 'total_size_gb': 0} - } - - for row in result.get('rows', []): - zone = row[0] or 'unknown' + + summary = {"by_zone": {}, "by_node": {}, "totals": {"primary": 0, "replica": 0, "total_size_gb": 0}} + + for row in result.get("rows", []): + zone = row[0] or "unknown" node_name = row[1] shard_type = row[2] shard_count = row[3] total_size_gb = float(row[4] or 0) - avg_size_gb = float(row[5] or 0) - + avg_size_gb = float(row[5] or 0) # noqa: F841 + # By zone summary - if zone not in summary['by_zone']: - summary['by_zone'][zone] = {'PRIMARY': 0, 'REPLICA': 0, 'total_size_gb': 0} - summary['by_zone'][zone][shard_type] += shard_count - summary['by_zone'][zone]['total_size_gb'] += total_size_gb - + if zone not in summary["by_zone"]: + summary["by_zone"][zone] = {"PRIMARY": 0, "REPLICA": 0, "total_size_gb": 0} + summary["by_zone"][zone][shard_type] += shard_count + summary["by_zone"][zone]["total_size_gb"] += total_size_gb + # By node summary - if node_name not in 
summary['by_node']: - summary['by_node'][node_name] = { - 'zone': zone, - 'PRIMARY': 0, - 'REPLICA': 0, - 'total_size_gb': 0 - } - summary['by_node'][node_name][shard_type] += shard_count - summary['by_node'][node_name]['total_size_gb'] += total_size_gb - + if node_name not in summary["by_node"]: + summary["by_node"][node_name] = {"zone": zone, "PRIMARY": 0, "REPLICA": 0, "total_size_gb": 0} + summary["by_node"][node_name][shard_type] += shard_count + summary["by_node"][node_name]["total_size_gb"] += total_size_gb + # Overall totals - if shard_type == 'PRIMARY': - summary['totals']['primary'] += shard_count + if shard_type == "PRIMARY": + summary["totals"]["primary"] += shard_count else: - summary['totals']['replica'] += shard_count - summary['totals']['total_size_gb'] += total_size_gb - + summary["totals"]["replica"] += shard_count + summary["totals"]["total_size_gb"] += total_size_gb + return summary - + def test_connection(self) -> bool: """Test the connection to CrateDB""" try: result = self.execute_query("SELECT 1") - return result.get('rowcount', 0) >= 0 + return result.get("rowcount", 0) >= 0 except Exception: return False - + def get_cluster_watermarks(self) -> Dict[str, Any]: """Get cluster disk watermark settings""" query = """ SELECT settings['cluster']['routing']['allocation']['disk']['watermark'] FROM sys.cluster """ - + try: result = self.execute_query(query) - if result.get('rows'): - watermarks = result['rows'][0][0] or {} + if result.get("rows"): + watermarks = result["rows"][0][0] or {} return { - 'low': watermarks.get('low', 'Not set'), - 'high': watermarks.get('high', 'Not set'), - 'flood_stage': watermarks.get('flood_stage', 'Not set'), - 'enable_for_single_data_node': watermarks.get('enable_for_single_data_node', 'Not set') + "low": watermarks.get("low", "Not set"), + "high": watermarks.get("high", "Not set"), + "flood_stage": watermarks.get("flood_stage", "Not set"), + "enable_for_single_data_node": 
watermarks.get("enable_for_single_data_node", "Not set"), } return {} except Exception: return {} - - def get_active_recoveries(self, table_name: Optional[str] = None, - node_name: Optional[str] = None) -> List[Dict[str, Any]]: + + def get_active_recoveries( + self, table_name: Optional[str] = None, node_name: Optional[str] = None + ) -> List[Dict[str, Any]]: """Get shards that are currently in recovery states from sys.allocations""" - + where_conditions = ["current_state != 'STARTED'"] parameters = [] - + if table_name: where_conditions.append("table_name = ?") parameters.append(table_name) - + if node_name: where_conditions.append("node_id = (SELECT id FROM sys.nodes WHERE name = ?)") parameters.append(node_name) - + where_clause = f"WHERE {' AND '.join(where_conditions)}" - + query = f""" - SELECT + SELECT table_name, shard_id, current_state, @@ -389,29 +385,31 @@ def get_active_recoveries(self, table_name: Optional[str] = None, FROM sys.allocations {where_clause} ORDER BY current_state, table_name, shard_id - """ - + """ # noqa: S608 + result = self.execute_query(query, parameters) - + allocations = [] - for row in result.get('rows', []): - allocations.append({ - 'schema_name': 'doc', # Default schema since not available in sys.allocations - 'table_name': row[0], - 'shard_id': row[1], - 'current_state': row[2], - 'explanation': row[3], - 'node_id': row[4] - }) - + for row in result.get("rows", []): + allocations.append( + { + "schema_name": "doc", # Default schema since not available in sys.allocations + "table_name": row[0], + "shard_id": row[1], + "current_state": row[2], + "explanation": row[3], + "node_id": row[4], + } + ) + return allocations - + def get_recovery_details(self, schema_name: str, table_name: str, shard_id: int) -> Optional[Dict[str, Any]]: """Get detailed recovery information for a specific shard from sys.shards""" - + # Query for shards that are actively recovering (not completed) query = """ - SELECT + SELECT s.table_name, s.schema_name, 
s.id as shard_id, @@ -429,117 +427,118 @@ def get_recovery_details(self, schema_name: str, table_name: str, shard_id: int) ORDER BY s.schema_name LIMIT 1 """ - + result = self.execute_query(query, [table_name, shard_id]) - - if not result.get('rows'): + + if not result.get("rows"): return None - - row = result['rows'][0] + + row = result["rows"][0] return { - 'table_name': row[0], - 'schema_name': row[1], - 'shard_id': row[2], - 'node_name': row[3], - 'node_id': row[4], - 'routing_state': row[5], - 'state': row[6], - 'recovery': row[7], - 'size': row[8], - 'primary': row[9], - 'translog_size': row[10] or 0 + "table_name": row[0], + "schema_name": row[1], + "shard_id": row[2], + "node_name": row[3], + "node_id": row[4], + "routing_state": row[5], + "state": row[6], + "recovery": row[7], + "size": row[8], + "primary": row[9], + "translog_size": row[10] or 0, } - - def get_all_recovering_shards(self, table_name: Optional[str] = None, - node_name: Optional[str] = None, - include_transitioning: bool = False) -> List[RecoveryInfo]: + + def get_all_recovering_shards( + self, table_name: Optional[str] = None, node_name: Optional[str] = None, include_transitioning: bool = False + ) -> List[RecoveryInfo]: """Get comprehensive recovery information by combining sys.allocations and sys.shards data""" - + # Step 1: Get active recoveries from allocations (efficient) active_allocations = self.get_active_recoveries(table_name, node_name) - + if not active_allocations: return [] - + recoveries = [] - + # Step 2: Get detailed recovery info for each active recovery for allocation in active_allocations: recovery_detail = self.get_recovery_details( - allocation['schema_name'], # This will be 'doc' default - allocation['table_name'], - allocation['shard_id'] + allocation["schema_name"], # This will be 'doc' default + allocation["table_name"], + allocation["shard_id"], ) - - if recovery_detail and recovery_detail.get('recovery'): + + if recovery_detail and recovery_detail.get("recovery"): 
# Update allocation with actual schema from sys.shards - allocation['schema_name'] = recovery_detail['schema_name'] + allocation["schema_name"] = recovery_detail["schema_name"] recovery_info = self._parse_recovery_info(allocation, recovery_detail) - + # Filter out completed recoveries unless include_transitioning is True if include_transitioning or not self._is_recovery_completed(recovery_info): recoveries.append(recovery_info) - + # Sort by recovery type, then by progress return sorted(recoveries, key=lambda r: (r.recovery_type, -r.overall_progress)) - - def _parse_recovery_info(self, allocation: Dict[str, Any], - shard_detail: Dict[str, Any]) -> RecoveryInfo: + + def _parse_recovery_info(self, allocation: Dict[str, Any], shard_detail: Dict[str, Any]) -> RecoveryInfo: """Parse recovery information from allocation and shard data""" - - recovery = shard_detail.get('recovery', {}) - + + recovery = shard_detail.get("recovery", {}) + # Extract recovery progress information - files_info = recovery.get('files', {}) - size_info = recovery.get('size', {}) - - files_percent = float(files_info.get('percent', 0.0)) - bytes_percent = float(size_info.get('percent', 0.0)) - + files_info = recovery.get("files", {}) + size_info = recovery.get("size", {}) + + files_percent = float(files_info.get("percent", 0.0)) + bytes_percent = float(size_info.get("percent", 0.0)) + # Calculate actual progress based on recovered vs used - files_recovered = files_info.get('recovered', 0) - files_used = files_info.get('used', 1) # Avoid division by zero - size_recovered = size_info.get('recovered', 0) - size_used = size_info.get('used', 1) # Avoid division by zero - + files_recovered = files_info.get("recovered", 0) + files_used = files_info.get("used", 1) # Avoid division by zero + size_recovered = size_info.get("recovered", 0) + size_used = size_info.get("used", 1) # Avoid division by zero + # Use actual progress if different from reported percent actual_files_percent = (files_recovered / 
files_used * 100.0) if files_used > 0 else files_percent actual_size_percent = (size_recovered / size_used * 100.0) if size_used > 0 else bytes_percent - + # Use the more conservative (lower) progress value final_files_percent = min(files_percent, actual_files_percent) final_bytes_percent = min(bytes_percent, actual_size_percent) - + # Get source node for PEER recoveries source_node = None - if recovery.get('type') == 'PEER': + if recovery.get("type") == "PEER": source_node = self._find_source_node_for_recovery( - shard_detail['schema_name'], - shard_detail['table_name'], - shard_detail['shard_id'], - shard_detail['node_id'] + shard_detail["schema_name"], + shard_detail["table_name"], + shard_detail["shard_id"], + shard_detail["node_id"], ) return RecoveryInfo( - schema_name=shard_detail['schema_name'], - table_name=shard_detail['table_name'], - shard_id=shard_detail['shard_id'], - node_name=shard_detail['node_name'], - node_id=shard_detail['node_id'], - recovery_type=recovery.get('type', 'UNKNOWN'), - stage=recovery.get('stage', 'UNKNOWN'), + schema_name=shard_detail["schema_name"], + table_name=shard_detail["table_name"], + shard_id=shard_detail["shard_id"], + node_name=shard_detail["node_name"], + node_id=shard_detail["node_id"], + recovery_type=recovery.get("type", "UNKNOWN"), + stage=recovery.get("stage", "UNKNOWN"), files_percent=final_files_percent, bytes_percent=final_bytes_percent, - total_time_ms=recovery.get('total_time', 0), - routing_state=shard_detail['routing_state'], - current_state=allocation['current_state'], - is_primary=shard_detail['primary'], - size_bytes=shard_detail.get('size', 0), + total_time_ms=recovery.get("total_time", 0), + routing_state=shard_detail["routing_state"], + current_state=allocation["current_state"], + is_primary=shard_detail["primary"], + size_bytes=shard_detail.get("size", 0), source_node_name=source_node, - translog_size_bytes=shard_detail.get('translog_size', 0) + translog_size_bytes=shard_detail.get("translog_size", 
0), ) - - def _find_source_node_for_recovery(self, schema_name: str, table_name: str, shard_id: int, target_node_id: str) -> Optional[str]: + + def _find_source_node_for_recovery( + self, schema_name: str, table_name: str, shard_id: int, target_node_id: str + ) -> Optional[str]: """Find source node for PEER recovery by looking for primary or other replicas""" try: # First try to find the primary shard of the same table/shard @@ -551,12 +550,12 @@ def _find_source_node_for_recovery(self, schema_name: str, table_name: str, shar AND "primary" = true LIMIT 1 """ - + result = self.execute_query(query, [schema_name, table_name, shard_id, target_node_id]) - - if result.get('rows'): - return result['rows'][0][0] - + + if result.get("rows"): + return result["rows"][0][0] + # If no primary found, look for any started replica query_replica = """ SELECT node['name'] as node_name @@ -565,20 +564,22 @@ def _find_source_node_for_recovery(self, schema_name: str, table_name: str, shar AND state = 'STARTED' AND node['id'] != ? 
LIMIT 1 """ - + result = self.execute_query(query_replica, [schema_name, table_name, shard_id, target_node_id]) - - if result.get('rows'): - return result['rows'][0][0] - + + if result.get("rows"): + return result["rows"][0][0] + except Exception: # If query fails, just return None - pass - + logger.warning("Failed to find source node for recovery", exc_info=True) + return None def _is_recovery_completed(self, recovery_info: RecoveryInfo) -> bool: """Check if a recovery is completed but still transitioning""" - return (recovery_info.stage == 'DONE' and - recovery_info.files_percent >= 100.0 and - recovery_info.bytes_percent >= 100.0) \ No newline at end of file + return ( + recovery_info.stage == "DONE" + and recovery_info.files_percent >= 100.0 + and recovery_info.bytes_percent >= 100.0 + ) diff --git a/cratedb_toolkit/cli.py b/cratedb_toolkit/cli.py index 2410d5ec..80e0b395 100644 --- a/cratedb_toolkit/cli.py +++ b/cratedb_toolkit/cli.py @@ -3,8 +3,8 @@ from cratedb_toolkit.util.cli import boot_click -from .admin.xmover.cli import main as admin_xmover_cli from .adapter.rockset.cli import cli as rockset_cli +from .admin.xmover.cli import main as admin_xmover_cli from .cfr.cli import cli as cfr_cli from .cluster.cli import cli as cloud_cli from .cmd.tail.cli import cli as tail_cli diff --git a/pyproject.toml b/pyproject.toml index c12d4c32..bd3fa6be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -324,11 +324,12 @@ lint.extend-ignore = [ "S108", ] -lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` -lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused +lint.per-file-ignores."cratedb_toolkit/admin/xmover/analyzer.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused 
lint.per-file-ignores."doc/conf.py" = [ "A001", "ERA001" ] -lint.per-file-ignores."examples/*" = [ "ERA001", "F401", "T201", "T203" ] # Allow `print` and `pprint` -lint.per-file-ignores."tests/*" = [ "S101" ] # Allow use of `assert`, and `print`. +lint.per-file-ignores."examples/*" = [ "ERA001", "F401", "T201", "T203" ] # Allow `print` and `pprint` +lint.per-file-ignores."tests/*" = [ "S101" ] # Allow use of `assert`, and `print`. lint.per-file-ignores."tests/adapter/test_rockset.py" = [ "E402" ] lint.per-file-ignores."tests/info/test_http.py" = [ "E402" ] From 4efd1edea73e65930ef2075ed1eff1fcb81ab342 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 11:16:17 +0200 Subject: [PATCH 03/18] Admin/XMover: Type checking --- cratedb_toolkit/admin/xmover/analyzer.py | 30 +++++++++++++++--------- cratedb_toolkit/admin/xmover/cli.py | 12 +++++----- cratedb_toolkit/admin/xmover/database.py | 16 +++++++++---- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/cratedb_toolkit/admin/xmover/analyzer.py b/cratedb_toolkit/admin/xmover/analyzer.py index 36d43618..e160d21a 100644 --- a/cratedb_toolkit/admin/xmover/analyzer.py +++ b/cratedb_toolkit/admin/xmover/analyzer.py @@ -2,13 +2,16 @@ Shard analysis and rebalancing logic for CrateDB """ +import logging import math from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple, Union from .database import CrateDBClient, NodeInfo, RecoveryInfo, ShardInfo +logger = logging.getLogger(__name__) + @dataclass class MoveRecommendation: @@ -70,10 +73,10 @@ def __init__(self, client: CrateDBClient): self.nodes: List[NodeInfo] = [] self.shards: List[ShardInfo] = [] - # Initialize session-based caches for performance - self._zone_conflict_cache = {} - self._node_lookup_cache = {} - self._target_nodes_cache = {} + # Initialize session-based caches for performance. 
+ self._zone_conflict_cache: Dict[Tuple[str, int, str], Union[str, None]] = {} + self._node_lookup_cache: Dict[str, Union[NodeInfo, None]] = {} + self._target_nodes_cache: Dict[Tuple[float, frozenset[Any], float, float], List[NodeInfo]] = {} self._cache_hits = 0 self._cache_misses = 0 @@ -99,8 +102,8 @@ def analyze_distribution(self, table_name: Optional[str] = None) -> Distribution total_size_gb = sum(s.size_gb for s in shards) # Count by zone and node - zone_counts = defaultdict(int) - node_counts = defaultdict(int) + zone_counts: Dict[str, int] = defaultdict(int) + node_counts: Dict[str, int] = defaultdict(int) for shard in shards: zone_counts[shard.zone] += 1 @@ -171,7 +174,7 @@ def check_zone_balance( shards = [s for s in shards if s.table_name == table_name] # Count shards by zone and type - zone_stats = defaultdict(lambda: {"PRIMARY": 0, "REPLICA": 0, "TOTAL": 0}) + zone_stats: Dict[str, Dict] = defaultdict(lambda: {"PRIMARY": 0, "REPLICA": 0, "TOTAL": 0}) for shard in shards: shard_type = shard.shard_type @@ -243,7 +246,7 @@ def generate_rebalancing_recommendations( source_node: If specified, only generate recommendations for shards on this node max_disk_usage_percent: Maximum disk usage percentage for target nodes """ - recommendations = [] + recommendations: List[MoveRecommendation] = [] # Get moveable shards (only healthy ones for actual operations) moveable_shards = self.find_moveable_shards(min_size_gb, max_size_gb, table_name) @@ -287,7 +290,12 @@ def generate_rebalancing_recommendations( total_evaluated = 0 for i, shard in enumerate(processing_shards): + if shard is None: + logger.info(f"Shard not found: {i}") + continue + if len(recommendations) >= max_recommendations: + logger.info(f"Found {len(recommendations)} recommendations for shard: {shard.shard_id}") break # Show progress every 50 shards when processing many @@ -344,6 +352,7 @@ def generate_rebalancing_recommendations( if not safe_target_nodes: continue # No safe targets found, skip this 
shard + target_node: NodeInfo if prioritize_space: # Space priority mode: choose node with most available space target_node = safe_target_nodes[0] # Already sorted by available space (desc) @@ -356,7 +365,6 @@ def generate_rebalancing_recommendations( # Choose target node with intelligent priority: # 1. If a node has significantly more space (2x) than zone-preferred nodes, prioritize space # 2. Otherwise, prefer zone balancing first, then available space - target_node = None if preferred_nodes and other_nodes: best_preferred = preferred_nodes[0] # Most space in preferred zones @@ -656,7 +664,7 @@ def get_cluster_overview(self) -> Dict[str, Any]: # Get cluster watermark settings watermarks = self.client.get_cluster_watermarks() - overview = { + overview: Dict[str, Any] = { "nodes": len(self.nodes), "zones": len({node.zone for node in self.nodes}), "total_shards": len(self.shards), diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 2ce29fdf..18f4c86f 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -4,7 +4,7 @@ import sys import time -from typing import Optional +from typing import Any, Dict, List, Optional, cast import click from rich import box @@ -13,7 +13,7 @@ from rich.table import Table from .analyzer import MoveRecommendation, RecoveryMonitor, ShardAnalyzer -from .database import CrateDBClient +from .database import CrateDBClient, ShardInfo console = Console() @@ -600,7 +600,7 @@ def zone_analysis(ctx, table: Optional[str], show_shards: bool): return # Organize by table and shard - tables = {} + tables: Dict[str, Dict[str, List[ShardInfo]]] = {} for shard in shards: table_key = f"{shard.schema_name}.{shard.table_name}" if table_key not in tables: @@ -850,7 +850,7 @@ def explain_error(ctx, error_message: Optional[str]): if not error_message: console.print("Please paste the CrateDB error message (press Enter twice when done):") - lines = [] + lines: List[str] = [] while True: 
try: line = input() @@ -933,7 +933,7 @@ def explain_error(ctx, error_message: Optional[str]): error_lower = error_message.lower() for pattern_info in error_patterns: - if pattern_info["pattern"].lower() in error_lower: + if cast(str, pattern_info["pattern"]).lower() in error_lower: matches.append(pattern_info) if matches: @@ -1013,7 +1013,7 @@ def monitor_recovery( console.print("=" * 80) # Track previous state for change detection - previous_recoveries = {} + previous_recoveries: Dict[str, Dict[str, Any]] = {} previous_timestamp = None first_run = True diff --git a/cratedb_toolkit/admin/xmover/database.py b/cratedb_toolkit/admin/xmover/database.py index a6e2d35f..55a6c194 100644 --- a/cratedb_toolkit/admin/xmover/database.py +++ b/cratedb_toolkit/admin/xmover/database.py @@ -5,7 +5,7 @@ import logging import os from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import requests from dotenv import load_dotenv @@ -118,7 +118,9 @@ class CrateDBClient: def __init__(self, connection_string: Optional[str] = None): load_dotenv() - self.connection_string = connection_string or os.getenv("CRATE_CONNECTION_STRING") + self.connection_string: str = ( + connection_string or os.getenv("CRATE_CONNECTION_STRING") or "http://localhost:4200" + ) if not self.connection_string: raise ValueError("CRATE_CONNECTION_STRING not found in environment or provided") @@ -132,7 +134,7 @@ def __init__(self, connection_string: Optional[str] = None): def execute_query(self, query: str, parameters: Optional[List] = None) -> Dict[str, Any]: """Execute a SQL query against CrateDB""" - payload = {"stmt": query} + payload: Dict[str, Any] = {"stmt": query} if parameters: payload["args"] = parameters @@ -207,7 +209,7 @@ def get_shards_info( if not for_analysis: # For operations, only include healthy shards where_conditions.extend(["s.routing_state = 'STARTED'", "s.recovery['files']['percent'] = 100.0"]) - parameters = [] + 
parameters: List[Union[str, int, Dict]] = [] if table_name: where_conditions.append("s.table_name = ?") @@ -297,7 +299,11 @@ def get_shard_distribution_summary(self, for_analysis: bool = True) -> Dict[str, result = self.execute_query(query) - summary = {"by_zone": {}, "by_node": {}, "totals": {"primary": 0, "replica": 0, "total_size_gb": 0}} + summary: Dict[str, Any] = { + "by_zone": {}, + "by_node": {}, + "totals": {"primary": 0, "replica": 0, "total_size_gb": 0}, + } for row in result.get("rows", []): zone = row[0] or "unknown" From 163001e459131b5c0a751920bad62ca22fb88b06 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 11:24:14 +0200 Subject: [PATCH 04/18] Admin/XMover: Implement suggestions by CodeRabbit --- CHANGES.md | 3 +-- doc/admin/xmover/handbook.md | 12 ++++++------ doc/admin/xmover/index.md | 2 +- doc/admin/xmover/queries.md | 12 +++++++++--- doc/admin/xmover/troubleshooting.md | 7 +++++-- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 7ef70478..bbc19439 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,8 +1,7 @@ # Changelog ## Unreleased -- Admin: Added XMover - CrateDB Shard Analyzer and Movement Tool. - Thanks, @WalBeh. +- Admin: Added XMover - CrateDB shard analyzer and movement tool. Thanks, @WalBeh. ## 2025/08/19 v0.0.41 - I/O: Updated to `influxio-0.6.0`. Thanks, @ZillKhan. diff --git a/doc/admin/xmover/handbook.md b/doc/admin/xmover/handbook.md index c103c6f2..cf9b4abe 100644 --- a/doc/admin/xmover/handbook.md +++ b/doc/admin/xmover/handbook.md @@ -368,7 +368,7 @@ xmover recommend --prioritize-zones --execute ### Connection String Format -``` +```text https://hostname:port ``` @@ -407,7 +407,7 @@ xmover analyze ### Common Issues and Solutions 1. 
**Zone Conflicts** - ``` + ```text Error: "NO(a copy of this shard is already allocated to this node)" ``` - **Cause**: Target node already has a copy of the shard @@ -415,7 +415,7 @@ xmover analyze - **Prevention**: Always use `xmover validate-move` before executing moves 2. **Zone Allocation Limits** - ``` + ```text Error: "too many copies of the shard allocated to nodes with attribute [zone]" ``` - **Cause**: CrateDB's zone awareness prevents too many copies in same zone @@ -423,7 +423,7 @@ xmover analyze - **Prevention**: Use `xmover recommend` which respects zone constraints 3. **Insufficient Space** - ``` + ```text Error: "not enough disk space" ``` - **Cause**: Target node lacks sufficient free space @@ -431,7 +431,7 @@ xmover analyze - **Check**: `xmover analyze` to see available space per node 4. **High Disk Usage Blocking Moves** - ``` + ```text Error: "Target node disk usage too high (85.3%)" ``` - **Cause**: Target node exceeds default 85% disk usage threshold @@ -465,7 +465,7 @@ XMover uses configurable safety thresholds to prevent risky moves: xmover recommend --max-disk-usage 90 --prioritize-space # For urgent space relief -xmover validate-move SCHEMA.TABLE SHARD_ID FROM TO --max-disk-usage 95 +xmover validate-move --max-disk-usage 95 ``` **When to Adjust Thresholds:** diff --git a/doc/admin/xmover/index.md b/doc/admin/xmover/index.md index 7b522310..affa4825 100644 --- a/doc/admin/xmover/index.md +++ b/doc/admin/xmover/index.md @@ -25,5 +25,5 @@ SQL commands for shard rebalancing and node decommissioning. 
Handbook Troubleshooting -Query gallery +Query Gallery ``` diff --git a/doc/admin/xmover/queries.md b/doc/admin/xmover/queries.md index 4600038c..27bd89e6 100644 --- a/doc/admin/xmover/queries.md +++ b/doc/admin/xmover/queries.md @@ -47,8 +47,15 @@ select node['name'], primary, sum(size) / 1024^3, count(id) from sys.shards g ``` ## Nodes available Space - ```sql +SELECT + name, + attributes['zone'] AS zone, + fs['total']['available'] / power(1024, 3) AS available_gb +FROM sys.nodes +ORDER BY name; +``` +```text +------------+--------------------+-----------------------------------------------+ | name | attributes['zone'] | (fs[1]['disks']['available'] / 1.073741824E9) | +------------+--------------------+-----------------------------------------------+ @@ -87,8 +94,7 @@ SELECT 8 rows in set (0.062 sec) ## Move REROUTE ```sql - -alter table "curvo"."bottlefieldData" reroute move shard 21 from 'data-hot-2' to 'data-hot-3'; +ALTER TABLE curvo.bottlefielddata REROUTE MOVE SHARD 21 FROM 'data-hot-2' TO 'data-hot-3'; ``` --- diff --git a/doc/admin/xmover/troubleshooting.md b/doc/admin/xmover/troubleshooting.md index 14567586..1afa477a 100644 --- a/doc/admin/xmover/troubleshooting.md +++ b/doc/admin/xmover/troubleshooting.md @@ -232,7 +232,10 @@ CRATE_SSL_VERIFY=true **Step 3: Test Network Access** ```bash # Test HTTP connectivity -curl -u username:password https://your-cluster:4200/_sql -d '{"stmt":"SELECT 1"}' +curl -u 'username:password' \ + -H 'Content-Type: application/json' \ + 'https://your-cluster:4200/_sql' \ + -d '{"stmt":"SELECT 1"}' ``` #### Prevention @@ -285,7 +288,7 @@ xmover explain-error "your error message here" 4. **Validate specific moves** ```bash - xmover validate-move SCHEMA.TABLE SHARD_ID FROM TO + xmover validate-move ``` 5. 
**Execute gradually** From de9ba97831f5a520a1ce884ff28fe8315bc927a4 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 13:24:55 +0200 Subject: [PATCH 05/18] Admin/XMover: Add software tests --- tests/admin/__init__.py | 0 tests/admin/test_cli.py | 67 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 tests/admin/__init__.py create mode 100644 tests/admin/test_cli.py diff --git a/tests/admin/__init__.py b/tests/admin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/admin/test_cli.py b/tests/admin/test_cli.py new file mode 100644 index 00000000..60e8d810 --- /dev/null +++ b/tests/admin/test_cli.py @@ -0,0 +1,67 @@ +import pytest +from click.testing import CliRunner + +from cratedb_toolkit.admin.xmover.cli import main as cli + + +@pytest.mark.parametrize( + "subcommand", + [ + "analyze", + "check-balance", + "explain-error", + "find-candidates", + "monitor-recovery", + "recommend", + "test-connection", + "zone-analysis", + ], +) +def test_xmover_all(cratedb, subcommand): + """ + CLI test: Invoke `xmover <subcommand>`. + """ + http_url = cratedb.get_http_url() + runner = CliRunner() + + result = runner.invoke( + cli, + args=subcommand, + env={"CRATE_CONNECTION_STRING": http_url}, + catch_exceptions=False, + ) + assert result.exit_code == 0 + + +def test_xmover_validate_move_success(cratedb): + """ + CLI test: Invoke `xmover validate-move`. + """ + http_url = cratedb.get_http_url() + runner = CliRunner() + + result = runner.invoke( + cli, + args=["validate-move", "doc.demo", "1", "42", "84"], + env={"CRATE_CONNECTION_STRING": http_url}, + catch_exceptions=False, + ) + assert result.exit_code == 0 + assert "Source node '42' not found in cluster" in result.output + + +def test_xmover_validate_move_failure(cratedb): + """ + CLI test: Invoke `xmover validate-move`. 
+ """ + http_url = cratedb.get_http_url() + runner = CliRunner() + + result = runner.invoke( + cli, + args=["validate-move"], + env={"CRATE_CONNECTION_STRING": http_url}, + catch_exceptions=False, + ) + assert result.exit_code == 2 + assert "Error: Missing argument 'SCHEMA_TABLE'." in result.output From 0d7fcb5a9ec6e4e73dd2df5f52c8c7c6f230a278 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 19:19:56 +0200 Subject: [PATCH 06/18] Admin/XMover: Refactor -- "recovery" --- cratedb_toolkit/admin/xmover/analyzer.py | 150 +-------- cratedb_toolkit/admin/xmover/cli.py | 313 ++---------------- cratedb_toolkit/admin/xmover/database.py | 102 +----- cratedb_toolkit/admin/xmover/model.py | 101 ++++++ cratedb_toolkit/admin/xmover/recovery.py | 384 +++++++++++++++++++++++ cratedb_toolkit/admin/xmover/util.py | 45 +++ 6 files changed, 555 insertions(+), 540 deletions(-) create mode 100644 cratedb_toolkit/admin/xmover/model.py create mode 100644 cratedb_toolkit/admin/xmover/recovery.py create mode 100644 cratedb_toolkit/admin/xmover/util.py diff --git a/cratedb_toolkit/admin/xmover/analyzer.py b/cratedb_toolkit/admin/xmover/analyzer.py index e160d21a..f9b8d6a9 100644 --- a/cratedb_toolkit/admin/xmover/analyzer.py +++ b/cratedb_toolkit/admin/xmover/analyzer.py @@ -8,7 +8,8 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Set, Tuple, Union -from .database import CrateDBClient, NodeInfo, RecoveryInfo, ShardInfo +from .database import CrateDBClient +from .model import NodeInfo, ShardInfo logger = logging.getLogger(__name__) @@ -878,150 +879,3 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100. 
"estimated_time_hours": len(move_plan) * 0.1, # Rough estimate: 6 minutes per move "message": "Decommission plan generated" if feasible else "Decommission not currently feasible", } - - -class RecoveryMonitor: - """Monitor shard recovery operations""" - - def __init__(self, client: CrateDBClient): - self.client = client - - def get_cluster_recovery_status( - self, - table_name: Optional[str] = None, - node_name: Optional[str] = None, - recovery_type_filter: str = "all", - include_transitioning: bool = False, - ) -> List[RecoveryInfo]: - """Get comprehensive recovery status with minimal cluster impact""" - - # Get all recovering shards using the efficient combined query - recoveries = self.client.get_all_recovering_shards(table_name, node_name, include_transitioning) - - # Apply recovery type filter - if recovery_type_filter != "all": - recoveries = [r for r in recoveries if r.recovery_type.upper() == recovery_type_filter.upper()] - - return recoveries - - def get_recovery_summary(self, recoveries: List[RecoveryInfo]) -> Dict[str, Any]: - """Generate a summary of recovery operations""" - - if not recoveries: - return {"total_recoveries": 0, "by_type": {}, "by_stage": {}, "avg_progress": 0.0, "total_size_gb": 0.0} - - # Group by recovery type - by_type = {} - by_stage = {} - total_progress = 0.0 - total_size_gb = 0.0 - - for recovery in recoveries: - # By type - if recovery.recovery_type not in by_type: - by_type[recovery.recovery_type] = {"count": 0, "total_size_gb": 0.0, "avg_progress": 0.0} - by_type[recovery.recovery_type]["count"] += 1 - by_type[recovery.recovery_type]["total_size_gb"] += recovery.size_gb - - # By stage - if recovery.stage not in by_stage: - by_stage[recovery.stage] = 0 - by_stage[recovery.stage] += 1 - - # Totals - total_progress += recovery.overall_progress - total_size_gb += recovery.size_gb - - # Calculate averages - for type_name, rec_type in by_type.items(): - if rec_type["count"] > 0: - type_recoveries = [r for r in recoveries if 
r.recovery_type == type_name] - if type_recoveries: - rec_type["avg_progress"] = sum(r.overall_progress for r in type_recoveries) / len(type_recoveries) - - return { - "total_recoveries": len(recoveries), - "by_type": by_type, - "by_stage": by_stage, - "avg_progress": total_progress / len(recoveries) if recoveries else 0.0, - "total_size_gb": total_size_gb, - } - - def format_recovery_display(self, recoveries: List[RecoveryInfo]) -> str: - """Format recovery information for display""" - - if not recoveries: - return "βœ… No active shard recoveries found" - - # Group by recovery type - peer_recoveries = [r for r in recoveries if r.recovery_type == "PEER"] - disk_recoveries = [r for r in recoveries if r.recovery_type == "DISK"] - other_recoveries = [r for r in recoveries if r.recovery_type not in ["PEER", "DISK"]] - - output = [f"\nπŸ”„ Active Shard Recoveries ({len(recoveries)} total)"] - output.append("=" * 80) - - if peer_recoveries: - output.append(f"\nπŸ“‘ PEER Recoveries ({len(peer_recoveries)})") - output.append(self._format_recovery_table(peer_recoveries)) - - if disk_recoveries: - output.append(f"\nπŸ’Ύ DISK Recoveries ({len(disk_recoveries)})") - output.append(self._format_recovery_table(disk_recoveries)) - - if other_recoveries: - output.append(f"\nπŸ”§ Other Recoveries ({len(other_recoveries)})") - output.append(self._format_recovery_table(other_recoveries)) - - # Add summary - summary = self.get_recovery_summary(recoveries) - output.append("\nπŸ“Š Summary:") - output.append(f" Total size: {summary['total_size_gb']:.1f} GB") - output.append(f" Average progress: {summary['avg_progress']:.1f}%") - - return "\n".join(output) - - def _format_recovery_table(self, recoveries: List[RecoveryInfo]) -> str: - """Format a table of recovery information""" - - if not recoveries: - return " No recoveries of this type" - - # Table headers - headers = ["Table", "Shard", "Node", "Type", "Stage", "Progress", "Size(GB)", "Time(s)"] - - # Calculate column widths - col_widths 
= [len(h) for h in headers] - - rows = [] - for recovery in recoveries: - row = [ - f"{recovery.schema_name}.{recovery.table_name}", - str(recovery.shard_id), - recovery.node_name, - recovery.shard_type, - recovery.stage, - f"{recovery.overall_progress:.1f}%", - f"{recovery.size_gb:.1f}", - f"{recovery.total_time_seconds:.1f}", - ] - rows.append(row) - - # Update column widths - for i, cell in enumerate(row): - col_widths[i] = max(col_widths[i], len(cell)) - - # Format table - output = [] - - # Header row - header_row = " " + " | ".join(h.ljust(w) for h, w in zip(headers, col_widths)) - output.append(header_row) - output.append(" " + "-" * (len(header_row) - 3)) - - # Data rows - for row in rows: - data_row = " " + " | ".join(cell.ljust(w) for cell, w in zip(row, col_widths)) - output.append(data_row) - - return "\n".join(output) diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 18f4c86f..2ad88d15 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -4,7 +4,7 @@ import sys import time -from typing import Any, Dict, List, Optional, cast +from typing import Dict, List, Optional, cast import click from rich import box @@ -12,57 +12,14 @@ from rich.panel import Panel from rich.table import Table -from .analyzer import MoveRecommendation, RecoveryMonitor, ShardAnalyzer -from .database import CrateDBClient, ShardInfo +from cratedb_toolkit.admin.xmover.model import ShardInfo -console = Console() - - -def format_size(size_gb: float) -> str: - """Format size in GB with appropriate precision""" - if size_gb >= 1000: - return f"{size_gb / 1000:.1f}TB" - elif size_gb >= 1: - return f"{size_gb:.1f}GB" - else: - return f"{size_gb * 1000:.0f}MB" - - -def format_percentage(value: float) -> str: - """Format percentage with color coding""" - color = "green" - if value > 80: - color = "red" - elif value > 70: - color = "yellow" - return f"[{color}]{value:.1f}%[/{color}]" - - -def 
format_translog_info(recovery_info) -> str: - """Format translog size information with color coding""" - tl_bytes = recovery_info.translog_size_bytes - - # Only show if significant (>10MB for production) - if tl_bytes < 10 * 1024 * 1024: # 10MB for production - return "" - - tl_gb = recovery_info.translog_size_gb +from .analyzer import MoveRecommendation, ShardAnalyzer +from .database import CrateDBClient +from .recovery import RecoveryMonitor, RecoveryOptions +from .util import format_percentage, format_size - # Color coding based on size - if tl_gb >= 5.0: - color = "red" - elif tl_gb >= 1.0: - color = "yellow" - else: - color = "green" - - # Format size - if tl_gb >= 1.0: - size_str = f"{tl_gb:.1f}GB" - else: - size_str = f"{tl_gb * 1000:.0f}MB" - - return f" [dim]([{color}]TL:{size_str}[/{color}])[/dim]" +console = Console() @click.group() @@ -998,254 +955,28 @@ def monitor_recovery( xmover monitor-recovery --watch # Continuous monitoring xmover monitor-recovery --recovery-type PEER # Only PEER recoveries """ - try: - client = ctx.obj["client"] - recovery_monitor = RecoveryMonitor(client) - - if watch: - console.print(f"πŸ”„ Monitoring shard recoveries (refreshing every {refresh_interval}s)") - console.print("Press Ctrl+C to stop") - console.print() - - try: - # Show header once - console.print("πŸ“Š Recovery Progress Monitor") - console.print("=" * 80) - - # Track previous state for change detection - previous_recoveries: Dict[str, Dict[str, Any]] = {} - previous_timestamp = None - first_run = True - - while True: - # Get current recovery status - recoveries = recovery_monitor.get_cluster_recovery_status( - table_name=table, - node_name=node, - recovery_type_filter=recovery_type, - include_transitioning=include_transitioning, - ) - - # Display current time - from datetime import datetime - - current_time = datetime.now().strftime("%H:%M:%S") - - # Check for any changes - changes = [] - active_count = 0 - completed_count = 0 - - for recovery in recoveries: - 
recovery_key = ( - f"{recovery.schema_name}.{recovery.table_name}.{recovery.shard_id}.{recovery.node_name}" - ) - - # Create complete table name - if recovery.schema_name == "doc": - table_display = recovery.table_name - else: - table_display = f"{recovery.schema_name}.{recovery.table_name}" - - # Count active vs completed - if recovery.stage == "DONE" and recovery.overall_progress >= 100.0: - completed_count += 1 - else: - active_count += 1 - - # Check for changes since last update - if recovery_key in previous_recoveries: - prev = previous_recoveries[recovery_key] - if prev["progress"] != recovery.overall_progress: - diff = recovery.overall_progress - prev["progress"] - # Create node route display - node_route = "" - if recovery.recovery_type == "PEER" and recovery.source_node_name: - node_route = f" {recovery.source_node_name} β†’ {recovery.node_name}" - elif recovery.recovery_type == "DISK": - node_route = f" disk β†’ {recovery.node_name}" - - # Add translog info - translog_info = format_translog_info(recovery) - - if diff > 0: - changes.append( - f"[green]πŸ“ˆ[/green] {table_display} S{recovery.shard_id} " - f"{recovery.overall_progress:.1f}% (+{diff:.1f}%) " - f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" - ) - else: - changes.append( - f"[yellow]πŸ“‰[/yellow] {table_display} S{recovery.shard_id} " - f"{recovery.overall_progress:.1f}% ({diff:.1f}%) " - f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" - ) - elif prev["stage"] != recovery.stage: - # Create node route display - node_route = "" - if recovery.recovery_type == "PEER" and recovery.source_node_name: - node_route = f" {recovery.source_node_name} β†’ {recovery.node_name}" - elif recovery.recovery_type == "DISK": - node_route = f" disk β†’ {recovery.node_name}" - - # Add translog info - translog_info = format_translog_info(recovery) - - changes.append( - f"[blue]πŸ”„[/blue] {table_display} S{recovery.shard_id} " - f"{prev['stage']}β†’{recovery.stage} " - 
f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" - ) - else: - # New recovery - show based on include_transitioning flag or first run - if ( - first_run - or include_transitioning - or (recovery.overall_progress < 100.0 or recovery.stage != "DONE") - ): - # Create node route display - node_route = "" - if recovery.recovery_type == "PEER" and recovery.source_node_name: - node_route = f" {recovery.source_node_name} β†’ {recovery.node_name}" - elif recovery.recovery_type == "DISK": - node_route = f" disk β†’ {recovery.node_name}" - - status_icon = "[cyan]πŸ†•[/cyan]" if not first_run else "[blue]πŸ“‹[/blue]" - # Add translog info - translog_info = format_translog_info(recovery) - - changes.append( - f"{status_icon} {table_display} S{recovery.shard_id} " - f"{recovery.stage} {recovery.overall_progress:.1f}% " - f"{recovery.size_gb:.1f}GB{translog_info}{node_route}" - ) - - # Store current state for next comparison - previous_recoveries[recovery_key] = { - "progress": recovery.overall_progress, - "stage": recovery.stage, - } - - # Always show a status line - if not recoveries: - console.print(f"{current_time} | [green]No recoveries - cluster stable[/green]") - previous_recoveries.clear() - else: - # Build status message - status = "" - if active_count > 0: - status = f"{active_count} active" - if completed_count > 0: - status += f", {completed_count} done" if status else f"{completed_count} done" - - # Show status line with changes or periodic update - if changes: - console.print(f"{current_time} | {status}") - for change in changes: - console.print(f" | {change}") - else: - # Show periodic status even without changes - if include_transitioning and completed_count > 0: - console.print(f"{current_time} | {status} (transitioning)") - elif active_count > 0: - console.print(f"{current_time} | {status} (no changes)") - - previous_timestamp = current_time # noqa: F841 - first_run = False - time.sleep(refresh_interval) - - except KeyboardInterrupt: - 
console.print("\n\n[yellow]⏹ Monitoring stopped by user[/yellow]") - - # Show final summary - final_recoveries = recovery_monitor.get_cluster_recovery_status( - table_name=table, - node_name=node, - recovery_type_filter=recovery_type, - include_transitioning=include_transitioning, - ) - - if final_recoveries: - console.print("\nπŸ“Š [bold]Final Recovery Summary:[/bold]") - summary = recovery_monitor.get_recovery_summary(final_recoveries) - - # Count active vs completed - active_count = len([r for r in final_recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]) - completed_count = len(final_recoveries) - active_count - - console.print(f" Total recoveries: {summary['total_recoveries']}") - console.print(f" Active: {active_count}, Completed: {completed_count}") - console.print(f" Total size: {summary['total_size_gb']:.1f} GB") - console.print(f" Average progress: {summary['avg_progress']:.1f}%") - - if summary["by_type"]: - console.print(" By recovery type:") - for rec_type, stats in summary["by_type"].items(): - console.print( - f" {rec_type}: {stats['count']} recoveries, " - f"{stats['avg_progress']:.1f}% avg progress" - ) - else: - console.print("\n[green]βœ… No active recoveries at exit[/green]") - - return - - else: - # Single status check - recoveries = recovery_monitor.get_cluster_recovery_status( - table_name=table, - node_name=node, - recovery_type_filter=recovery_type, - include_transitioning=include_transitioning, - ) - - display_output = recovery_monitor.format_recovery_display(recoveries) - console.print(display_output) - - if not recoveries: - if include_transitioning: - console.print("\n[green]βœ… No recoveries found (active or transitioning)[/green]") - else: - console.print("\n[green]βœ… No active recoveries found[/green]") - console.print( - "[dim]πŸ’‘ Use --include-transitioning to see completed recoveries still transitioning[/dim]" - ) - else: - # Show summary - summary = recovery_monitor.get_recovery_summary(recoveries) - 
console.print("\nπŸ“Š [bold]Recovery Summary:[/bold]") - console.print(f" Total recoveries: {summary['total_recoveries']}") - console.print(f" Total size: {summary['total_size_gb']:.1f} GB") - console.print(f" Average progress: {summary['avg_progress']:.1f}%") - - # Show breakdown by type - if summary["by_type"]: - console.print("\n By recovery type:") - for rec_type, stats in summary["by_type"].items(): - console.print( - f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress" - ) - - console.print("\n[dim]πŸ’‘ Use --watch flag for continuous monitoring[/dim]") - - except Exception as e: - console.print(f"[red]❌ Error monitoring recoveries: {e}[/red]") - if ctx.obj.get("debug"): - raise + recovery_monitor = RecoveryMonitor( + client=ctx.obj["client"], + options=RecoveryOptions( + table=table, + node=node, + refresh_interval=refresh_interval, + recovery_type=recovery_type, + include_transitioning=include_transitioning, + ), + ) + recovery_monitor.start(watch=watch, debug=ctx.obj.get("debug")) def _wait_for_recovery_capacity(client, max_concurrent_recoveries: int = 5): """Wait until active recovery count is below threshold""" - from time import sleep - from .analyzer import RecoveryMonitor - - recovery_monitor = RecoveryMonitor(client) + recovery_monitor = RecoveryMonitor(client, RecoveryOptions(include_transitioning=True)) wait_time = 0 while True: # Check active recoveries (including transitioning) - recoveries = recovery_monitor.get_cluster_recovery_status(include_transitioning=True) + recoveries = recovery_monitor.get_cluster_recovery_status() active_count = len([r for r in recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]) status = f"{active_count}/{max_concurrent_recoveries}" if active_count < max_concurrent_recoveries: @@ -1257,15 +988,13 @@ def _wait_for_recovery_capacity(client, max_concurrent_recoveries: int = 5): elif wait_time % 30 == 0: # Update every 30 seconds console.print(f" [yellow]⏳ Still waiting... 
({status} active)[/yellow]") - sleep(10) # Check every 10 seconds + time.sleep(10) # Check every 10 seconds wait_time += 10 def _execute_recommendations_safely(client, recommendations, validate: bool): """Execute recommendations with extensive safety measures""" - from .analyzer import ShardAnalyzer - # Filter to only safe recommendations safe_recommendations = [] if validate: diff --git a/cratedb_toolkit/admin/xmover/database.py b/cratedb_toolkit/admin/xmover/database.py index 55a6c194..1cb16bb1 100644 --- a/cratedb_toolkit/admin/xmover/database.py +++ b/cratedb_toolkit/admin/xmover/database.py @@ -4,112 +4,14 @@ import logging import os -from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union import requests from dotenv import load_dotenv -logger = logging.getLogger(__name__) - +from cratedb_toolkit.admin.xmover.model import NodeInfo, RecoveryInfo, ShardInfo -@dataclass -class NodeInfo: - """Information about a CrateDB node""" - - id: str - name: str - zone: str - heap_used: int - heap_max: int - fs_total: int - fs_used: int - fs_available: int - - @property - def heap_usage_percent(self) -> float: - return (self.heap_used / self.heap_max) * 100 if self.heap_max > 0 else 0 - - @property - def disk_usage_percent(self) -> float: - return (self.fs_used / self.fs_total) * 100 if self.fs_total > 0 else 0 - - @property - def available_space_gb(self) -> float: - return self.fs_available / (1024**3) - - -@dataclass -class ShardInfo: - """Information about a shard""" - - table_name: str - schema_name: str - shard_id: int - node_id: str - node_name: str - zone: str - is_primary: bool - size_bytes: int - size_gb: float - num_docs: int - state: str - routing_state: str - - @property - def shard_type(self) -> str: - return "PRIMARY" if self.is_primary else "REPLICA" - - -@dataclass -class RecoveryInfo: - """Information about an active shard recovery""" - - schema_name: str - table_name: str - shard_id: int - node_name: str - node_id: str - 
"""Value objects shared by the XMover analyzer, recommender, and recovery monitor."""

from dataclasses import dataclass
from typing import Optional


@dataclass
class NodeInfo:
    """Identity plus heap/filesystem telemetry for a single CrateDB node."""

    id: str
    name: str
    zone: str
    heap_used: int
    heap_max: int
    fs_total: int
    fs_used: int
    fs_available: int

    @property
    def heap_usage_percent(self) -> float:
        """Heap utilisation in percent; 0 when the heap limit is unknown."""
        if self.heap_max <= 0:
            return 0
        return (self.heap_used / self.heap_max) * 100

    @property
    def disk_usage_percent(self) -> float:
        """Filesystem utilisation in percent; 0 when total capacity is unknown."""
        if self.fs_total <= 0:
            return 0
        return (self.fs_used / self.fs_total) * 100

    @property
    def available_space_gb(self) -> float:
        """Free filesystem space converted from bytes to binary gigabytes."""
        return self.fs_available / (1024**3)


@dataclass
class ShardInfo:
    """A single shard copy as reported by sys.shards, with placement and size."""

    table_name: str
    schema_name: str
    shard_id: int
    node_id: str
    node_name: str
    zone: str
    is_primary: bool
    size_bytes: int
    size_gb: float
    num_docs: int
    state: str
    routing_state: str

    @property
    def shard_type(self) -> str:
        """Human-readable role of this copy."""
        if self.is_primary:
            return "PRIMARY"
        return "REPLICA"


@dataclass
class RecoveryInfo:
    """Progress snapshot of one in-flight shard recovery."""

    schema_name: str
    table_name: str
    shard_id: int
    node_name: str
    node_id: str
    recovery_type: str  # PEER, DISK, etc.
    stage: str  # INIT, INDEX, VERIFY_INDEX, TRANSLOG, FINALIZE, DONE
    files_percent: float
    bytes_percent: float
    total_time_ms: int
    routing_state: str  # INITIALIZING, RELOCATING, etc.
    current_state: str  # from allocations
    is_primary: bool
    size_bytes: int
    source_node_name: Optional[str] = None  # Source node for PEER recoveries
    translog_size_bytes: int = 0  # Translog size in bytes

    @property
    def overall_progress(self) -> float:
        """Overall progress in percent: the larger of file and byte progress."""
        return max(self.files_percent, self.bytes_percent)

    @property
    def size_gb(self) -> float:
        """Shard size converted from bytes to binary gigabytes."""
        return self.size_bytes / (1024**3)

    @property
    def shard_type(self) -> str:
        """Human-readable role of the recovering copy."""
        if self.is_primary:
            return "PRIMARY"
        return "REPLICA"

    @property
    def total_time_seconds(self) -> float:
        """Elapsed recovery time converted from milliseconds to seconds."""
        return self.total_time_ms / 1000.0

    @property
    def translog_size_gb(self) -> float:
        """Translog size converted from bytes to binary gigabytes."""
        return self.translog_size_bytes / (1024**3)

    @property
    def translog_percentage(self) -> float:
        """Translog size relative to the shard size; 0 for an empty shard."""
        if self.size_bytes <= 0:
            return 0
        return self.translog_size_bytes / self.size_bytes * 100
"""Live monitoring of CrateDB shard recoveries (one-shot report or watch loop)."""

import dataclasses
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

from rich.console import Console

from cratedb_toolkit.admin.xmover.database import CrateDBClient
from cratedb_toolkit.admin.xmover.model import RecoveryInfo
from cratedb_toolkit.admin.xmover.util import format_translog_info

console = Console()


@dataclasses.dataclass
class RecoveryOptions:
    """Filter and refresh settings for a monitoring session."""

    table: Optional[str] = None  # restrict to one table
    node: Optional[str] = None  # restrict to one node
    refresh_interval: int = 10  # seconds between polls in watch mode
    include_transitioning: bool = False  # also show recoveries already at DONE/100%
    recovery_type: Optional[str] = None  # e.g. PEER or DISK (case-insensitive)


class RecoveryMonitor:
    """Monitor shard recovery operations"""

    def __init__(self, client: CrateDBClient, options: RecoveryOptions):
        self.client = client
        self.options = options

    def get_cluster_recovery_status(self) -> List[RecoveryInfo]:
        """Get comprehensive recovery status with minimal cluster impact"""

        # Get all recovering shards using the efficient combined query
        recoveries = self.client.get_all_recovering_shards(
            self.options.table, self.options.node, self.options.include_transitioning
        )

        # Apply recovery type filter (case-insensitive match against e.g. PEER/DISK)
        if self.options.recovery_type is not None:
            recoveries = [r for r in recoveries if r.recovery_type.upper() == self.options.recovery_type.upper()]

        return recoveries

    def get_recovery_summary(self, recoveries: List[RecoveryInfo]) -> Dict[str, Any]:
        """Generate a summary of recovery operations

        Returns a dict with total count, per-type and per-stage breakdowns,
        mean overall progress, and the summed shard size in GB.
        """

        if not recoveries:
            return {"total_recoveries": 0, "by_type": {}, "by_stage": {}, "avg_progress": 0.0, "total_size_gb": 0.0}

        # Group by recovery type
        by_type = {}
        by_stage = {}
        total_progress = 0.0
        total_size_gb = 0.0

        for recovery in recoveries:
            # By type
            if recovery.recovery_type not in by_type:
                by_type[recovery.recovery_type] = {"count": 0, "total_size_gb": 0.0, "avg_progress": 0.0}
            by_type[recovery.recovery_type]["count"] += 1
            by_type[recovery.recovery_type]["total_size_gb"] += recovery.size_gb

            # By stage
            if recovery.stage not in by_stage:
                by_stage[recovery.stage] = 0
            by_stage[recovery.stage] += 1

            # Totals
            total_progress += recovery.overall_progress
            total_size_gb += recovery.size_gb

        # Calculate averages (second pass re-selects each type's recoveries)
        for type_name, rec_type in by_type.items():
            if rec_type["count"] > 0:
                type_recoveries = [r for r in recoveries if r.recovery_type == type_name]
                if type_recoveries:
                    rec_type["avg_progress"] = sum(r.overall_progress for r in type_recoveries) / len(type_recoveries)

        return {
            "total_recoveries": len(recoveries),
            "by_type": by_type,
            "by_stage": by_stage,
            "avg_progress": total_progress / len(recoveries) if recoveries else 0.0,
            "total_size_gb": total_size_gb,
        }

    def format_recovery_display(self, recoveries: List[RecoveryInfo]) -> str:
        """Format recovery information for display

        Groups recoveries into PEER / DISK / other sections, each rendered as a
        plain-text table, followed by a short summary footer.
        """

        if not recoveries:
            return "βœ… No active shard recoveries found"

        # Group by recovery type
        peer_recoveries = [r for r in recoveries if r.recovery_type == "PEER"]
        disk_recoveries = [r for r in recoveries if r.recovery_type == "DISK"]
        other_recoveries = [r for r in recoveries if r.recovery_type not in ["PEER", "DISK"]]

        output = [f"\nπŸ”„ Active Shard Recoveries ({len(recoveries)} total)"]
        output.append("=" * 80)

        if peer_recoveries:
            output.append(f"\nπŸ“‘ PEER Recoveries ({len(peer_recoveries)})")
            output.append(self._format_recovery_table(peer_recoveries))

        if disk_recoveries:
            output.append(f"\nπŸ’Ύ DISK Recoveries ({len(disk_recoveries)})")
            output.append(self._format_recovery_table(disk_recoveries))

        if other_recoveries:
            output.append(f"\nπŸ”§ Other Recoveries ({len(other_recoveries)})")
            output.append(self._format_recovery_table(other_recoveries))

        # Add summary
        summary = self.get_recovery_summary(recoveries)
        output.append("\nπŸ“Š Summary:")
        output.append(f" Total size: {summary['total_size_gb']:.1f} GB")
        output.append(f" Average progress: {summary['avg_progress']:.1f}%")

        return "\n".join(output)

    def _format_recovery_table(self, recoveries: List[RecoveryInfo]) -> str:
        """Format a table of recovery information

        Column widths are computed from the widest cell per column so the
        output aligns without any external table library.
        """

        if not recoveries:
            return " No recoveries of this type"

        # Table headers
        headers = ["Table", "Shard", "Node", "Type", "Stage", "Progress", "Size(GB)", "Time(s)"]

        # Calculate column widths
        col_widths = [len(h) for h in headers]

        rows = []
        for recovery in recoveries:
            row = [
                f"{recovery.schema_name}.{recovery.table_name}",
                str(recovery.shard_id),
                recovery.node_name,
                recovery.shard_type,
                recovery.stage,
                f"{recovery.overall_progress:.1f}%",
                f"{recovery.size_gb:.1f}",
                f"{recovery.total_time_seconds:.1f}",
            ]
            rows.append(row)

            # Update column widths
            for i, cell in enumerate(row):
                col_widths[i] = max(col_widths[i], len(cell))

        # Format table
        output = []

        # Header row
        header_row = " " + " | ".join(h.ljust(w) for h, w in zip(headers, col_widths))
        output.append(header_row)
        output.append(" " + "-" * (len(header_row) - 3))

        # Data rows
        for row in rows:
            data_row = " " + " | ".join(cell.ljust(w) for cell, w in zip(row, col_widths))
            output.append(data_row)

        return "\n".join(output)

    def start(self, watch: bool, debug: bool = False):
        """Run the monitor.

        With ``watch=True`` this polls the cluster every ``refresh_interval``
        seconds and prints only *changes* (progress deltas, stage changes, new
        recoveries) until interrupted with Ctrl+C, at which point a final
        summary is printed. Without ``watch`` it prints a one-shot report.
        Exceptions are reported; they re-raise only when ``debug`` is set.
        """
        try:
            if watch:
                console.print(f"πŸ”„ Monitoring shard recoveries (refreshing every {self.options.refresh_interval}s)")
                console.print("Press Ctrl+C to stop")
                console.print()

                try:
                    # Show header once
                    console.print("πŸ“Š Recovery Progress Monitor")
                    console.print("=" * 80)

                    # Track previous state for change detection
                    previous_recoveries: Dict[str, Dict[str, Any]] = {}
                    previous_timestamp = None
                    first_run = True

                    while True:
                        # Get current recovery status
                        recoveries = self.get_cluster_recovery_status()

                        # Display current time
                        current_time = datetime.now().strftime("%H:%M:%S")

                        # Check for any changes
                        changes = []
                        active_count = 0
                        completed_count = 0

                        for recovery in recoveries:
                            # Key identifies one shard copy on one node across polls.
                            recovery_key = (
                                f"{recovery.schema_name}.{recovery.table_name}.{recovery.shard_id}.{recovery.node_name}"
                            )

                            # Create complete table name (the default "doc" schema is elided)
                            if recovery.schema_name == "doc":
                                table_display = recovery.table_name
                            else:
                                table_display = f"{recovery.schema_name}.{recovery.table_name}"

                            # Count active vs completed
                            if recovery.stage == "DONE" and recovery.overall_progress >= 100.0:
                                completed_count += 1
                            else:
                                active_count += 1

                            # Check for changes since last update
                            if recovery_key in previous_recoveries:
                                prev = previous_recoveries[recovery_key]
                                if prev["progress"] != recovery.overall_progress:
                                    diff = recovery.overall_progress - prev["progress"]
                                    # Create node route display
                                    node_route = ""
                                    if recovery.recovery_type == "PEER" and recovery.source_node_name:
                                        node_route = f" {recovery.source_node_name} β†’ {recovery.node_name}"
                                    elif recovery.recovery_type == "DISK":
                                        node_route = f" disk β†’ {recovery.node_name}"

                                    # Add translog info
                                    translog_info = format_translog_info(recovery)

                                    if diff > 0:
                                        changes.append(
                                            f"[green]πŸ“ˆ[/green] {table_display} S{recovery.shard_id} "
                                            f"{recovery.overall_progress:.1f}% (+{diff:.1f}%) "
                                            f"{recovery.size_gb:.1f}GB{translog_info}{node_route}"
                                        )
                                    else:
                                        changes.append(
                                            f"[yellow]πŸ“‰[/yellow] {table_display} S{recovery.shard_id} "
                                            f"{recovery.overall_progress:.1f}% ({diff:.1f}%) "
                                            f"{recovery.size_gb:.1f}GB{translog_info}{node_route}"
                                        )
                                elif prev["stage"] != recovery.stage:
                                    # Create node route display
                                    node_route = ""
                                    if recovery.recovery_type == "PEER" and recovery.source_node_name:
                                        node_route = f" {recovery.source_node_name} β†’ {recovery.node_name}"
                                    elif recovery.recovery_type == "DISK":
                                        node_route = f" disk β†’ {recovery.node_name}"

                                    # Add translog info
                                    translog_info = format_translog_info(recovery)

                                    changes.append(
                                        f"[blue]πŸ”„[/blue] {table_display} S{recovery.shard_id} "
                                        f"{prev['stage']}β†’{recovery.stage} "
                                        f"{recovery.size_gb:.1f}GB{translog_info}{node_route}"
                                    )
                            else:
                                # New recovery - show based on include_transitioning flag or first run
                                if (
                                    first_run
                                    or self.options.include_transitioning
                                    or (recovery.overall_progress < 100.0 or recovery.stage != "DONE")
                                ):
                                    # Create node route display
                                    node_route = ""
                                    if recovery.recovery_type == "PEER" and recovery.source_node_name:
                                        node_route = f" {recovery.source_node_name} β†’ {recovery.node_name}"
                                    elif recovery.recovery_type == "DISK":
                                        node_route = f" disk β†’ {recovery.node_name}"

                                    status_icon = "[cyan]πŸ†•[/cyan]" if not first_run else "[blue]πŸ“‹[/blue]"
                                    # Add translog info
                                    translog_info = format_translog_info(recovery)

                                    changes.append(
                                        f"{status_icon} {table_display} S{recovery.shard_id} "
                                        f"{recovery.stage} {recovery.overall_progress:.1f}% "
                                        f"{recovery.size_gb:.1f}GB{translog_info}{node_route}"
                                    )

                            # Store current state for next comparison
                            previous_recoveries[recovery_key] = {
                                "progress": recovery.overall_progress,
                                "stage": recovery.stage,
                            }

                        # Always show a status line
                        if not recoveries:
                            console.print(f"{current_time} | [green]No recoveries - cluster stable[/green]")
                            previous_recoveries.clear()
                        else:
                            # Build status message
                            status = ""
                            if active_count > 0:
                                status = f"{active_count} active"
                            if completed_count > 0:
                                status += f", {completed_count} done" if status else f"{completed_count} done"

                            # Show status line with changes or periodic update
                            if changes:
                                console.print(f"{current_time} | {status}")
                                for change in changes:
                                    console.print(f" | {change}")
                            else:
                                # Show periodic status even without changes
                                if self.options.include_transitioning and completed_count > 0:
                                    console.print(f"{current_time} | {status} (transitioning)")
                                elif active_count > 0:
                                    console.print(f"{current_time} | {status} (no changes)")

                        previous_timestamp = current_time  # noqa: F841
                        first_run = False
                        time.sleep(self.options.refresh_interval)

                except KeyboardInterrupt:
                    console.print("\n\n[yellow]⏹ Monitoring stopped by user[/yellow]")

                    # Show final summary
                    final_recoveries = self.get_cluster_recovery_status()

                    if final_recoveries:
                        console.print("\nπŸ“Š [bold]Final Recovery Summary:[/bold]")
                        summary = self.get_recovery_summary(final_recoveries)

                        # Count active vs completed
                        active_count = len(
                            [r for r in final_recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]
                        )
                        completed_count = len(final_recoveries) - active_count

                        console.print(f" Total recoveries: {summary['total_recoveries']}")
                        console.print(f" Active: {active_count}, Completed: {completed_count}")
                        console.print(f" Total size: {summary['total_size_gb']:.1f} GB")
                        console.print(f" Average progress: {summary['avg_progress']:.1f}%")

                        if summary["by_type"]:
                            console.print(" By recovery type:")
                            for rec_type, stats in summary["by_type"].items():
                                console.print(
                                    f" {rec_type}: {stats['count']} recoveries, "
                                    f"{stats['avg_progress']:.1f}% avg progress"
                                )
                    else:
                        console.print("\n[green]βœ… No active recoveries at exit[/green]")

                    return

            else:
                # Single status check
                recoveries = self.get_cluster_recovery_status()

                display_output = self.format_recovery_display(recoveries)
                console.print(display_output)

                if not recoveries:
                    if self.options.include_transitioning:
                        console.print("\n[green]βœ… No recoveries found (active or transitioning)[/green]")
                    else:
                        console.print("\n[green]βœ… No active recoveries found[/green]")
                        console.print(
                            "[dim]πŸ’‘ Use --include-transitioning to see completed recoveries still transitioning[/dim]"
                        )
                else:
                    # Show summary
                    summary = self.get_recovery_summary(recoveries)
                    console.print("\nπŸ“Š [bold]Recovery Summary:[/bold]")
                    console.print(f" Total recoveries: {summary['total_recoveries']}")
                    console.print(f" Total size: {summary['total_size_gb']:.1f} GB")
                    console.print(f" Average progress: {summary['avg_progress']:.1f}%")

                    # Show breakdown by type
                    if summary["by_type"]:
                        console.print("\n By recovery type:")
                        for rec_type, stats in summary["by_type"].items():
                            console.print(
                                f" {rec_type}: {stats['count']} recoveries, "
                                f"{stats['avg_progress']:.1f}% avg progress"
                            )

                    console.print("\n[dim]πŸ’‘ Use --watch flag for continuous monitoring[/dim]")

        except Exception as e:
            console.print(f"[red]❌ Error monitoring recoveries: {e}[/red]")
            if debug:
                raise
def format_size(size_gb: float) -> str:
    """Render a size given in GB with the most natural unit (MB/GB/TB)."""
    if size_gb >= 1000:
        return f"{size_gb / 1000:.1f}TB"
    if size_gb >= 1:
        return f"{size_gb:.1f}GB"
    return f"{size_gb * 1000:.0f}MB"


def format_percentage(value: float) -> str:
    """Colorize a percentage for Rich output: red above 80, yellow above 70, else green."""
    if value > 80:
        color = "red"
    elif value > 70:
        color = "yellow"
    else:
        color = "green"
    return f"[{color}]{value:.1f}%[/{color}]"


def format_translog_info(recovery_info) -> str:
    """Render a dimmed, color-coded translog size tag, or '' when below 10MB.

    Sizes of 5GB or more are red, 1GB or more yellow, anything smaller green.
    """
    tl_bytes = recovery_info.translog_size_bytes

    # Translogs under 10MB are noise in production; suppress them entirely.
    if tl_bytes < 10 * 1024 * 1024:
        return ""

    tl_gb = recovery_info.translog_size_gb

    if tl_gb >= 5.0:
        color = "red"
    elif tl_gb >= 1.0:
        color = "yellow"
    else:
        color = "green"

    size_str = f"{tl_gb:.1f}GB" if tl_gb >= 1.0 else f"{tl_gb * 1000:.0f}MB"
    return f" [dim]([{color}]TL:{size_str}[/{color}])[/dim]"
Refactor -- "recommender" --- cratedb_toolkit/admin/xmover/analyzer.py | 103 ++---- cratedb_toolkit/admin/xmover/cli.py | 355 ++----------------- cratedb_toolkit/admin/xmover/model.py | 68 +++- cratedb_toolkit/admin/xmover/recommender.py | 366 ++++++++++++++++++++ 4 files changed, 476 insertions(+), 416 deletions(-) create mode 100644 cratedb_toolkit/admin/xmover/recommender.py diff --git a/cratedb_toolkit/admin/xmover/analyzer.py b/cratedb_toolkit/admin/xmover/analyzer.py index f9b8d6a9..311b1a33 100644 --- a/cratedb_toolkit/admin/xmover/analyzer.py +++ b/cratedb_toolkit/admin/xmover/analyzer.py @@ -5,67 +5,14 @@ import logging import math from collections import defaultdict -from dataclasses import dataclass from typing import Any, Dict, List, Optional, Set, Tuple, Union from .database import CrateDBClient -from .model import NodeInfo, ShardInfo +from .model import DistributionStats, MoveRecommendation, NodeInfo, RecommendationConstraints, ShardInfo logger = logging.getLogger(__name__) -@dataclass -class MoveRecommendation: - """Recommendation for moving a shard""" - - table_name: str - schema_name: str - shard_id: int - from_node: str - to_node: str - from_zone: str - to_zone: str - shard_type: str - size_gb: float - reason: str - - def to_sql(self) -> str: - """Generate the SQL command for this move""" - return ( - f'ALTER TABLE "{self.schema_name}"."{self.table_name}" ' - f"REROUTE MOVE SHARD {self.shard_id} " - f"FROM '{self.from_node}' TO '{self.to_node}';" - ) - - @property - def safety_score(self) -> float: - """Calculate a safety score for this move (0-1, higher is safer)""" - score = 1.0 - - # Penalize if moving to same zone (not ideal for zone distribution) - if self.from_zone == self.to_zone: - score -= 0.3 - - # Bonus for zone balancing moves - if "rebalancing" in self.reason.lower(): - score += 0.2 - - # Ensure score stays in valid range - return max(0.0, min(1.0, score)) - - -@dataclass -class DistributionStats: - """Statistics about shard 
distribution""" - - total_shards: int - total_size_gb: float - zones: Dict[str, int] - nodes: Dict[str, int] - zone_balance_score: float # 0-100, higher is better - node_balance_score: float # 0-100, higher is better - - class ShardAnalyzer: """Analyzer for CrateDB shard distribution and rebalancing""" @@ -227,18 +174,7 @@ def find_nodes_with_capacity( available_nodes.sort(key=lambda n: n.available_space_gb, reverse=True) return available_nodes - def generate_rebalancing_recommendations( - self, - table_name: Optional[str] = None, - min_size_gb: float = 40.0, - max_size_gb: float = 60.0, - zone_tolerance_percent: float = 10.0, - min_free_space_gb: float = 100.0, - max_recommendations: int = 10, - prioritize_space: bool = False, - source_node: Optional[str] = None, - max_disk_usage_percent: float = 90.0, - ) -> List[MoveRecommendation]: + def generate_rebalancing_recommendations(self, constraints: RecommendationConstraints) -> List[MoveRecommendation]: """Generate recommendations for rebalancing shards Args: @@ -250,15 +186,18 @@ def generate_rebalancing_recommendations( recommendations: List[MoveRecommendation] = [] # Get moveable shards (only healthy ones for actual operations) - moveable_shards = self.find_moveable_shards(min_size_gb, max_size_gb, table_name) + moveable_shards = self.find_moveable_shards(constraints.min_size, constraints.max_size, constraints.table_name) - print(f"Analyzing {len(moveable_shards)} candidate shards in size range {min_size_gb}-{max_size_gb}GB...") + print( + f"Analyzing {len(moveable_shards)} candidate shards " + f"in size range {constraints.min_size}-{constraints.max_size}GB..." 
+ ) if not moveable_shards: return recommendations # Analyze current zone balance - zone_stats = self.check_zone_balance(table_name, zone_tolerance_percent) + zone_stats = self.check_zone_balance(constraints.table_name, constraints.zone_tolerance) # Calculate target distribution total_shards = sum(stats["TOTAL"] for stats in zone_stats.values()) @@ -271,8 +210,8 @@ def generate_rebalancing_recommendations( for zone, stats in zone_stats.items(): current_count = stats["TOTAL"] - threshold_high = target_per_zone * (1 + zone_tolerance_percent / 100) - threshold_low = target_per_zone * (1 - zone_tolerance_percent / 100) + threshold_high = target_per_zone * (1 + constraints.zone_tolerance / 100) + threshold_low = target_per_zone * (1 - constraints.zone_tolerance / 100) if current_count > threshold_high: overloaded_zones.append(zone) @@ -280,9 +219,9 @@ def generate_rebalancing_recommendations( underloaded_zones.append(zone) # Optimize processing: if filtering by source node, only process those shards - if source_node: - processing_shards = [s for s in moveable_shards if s.node_name == source_node] - print(f"Focusing on {len(processing_shards)} shards from node {source_node}") + if constraints.source_node: + processing_shards = [s for s in moveable_shards if s.node_name == constraints.source_node] + print(f"Focusing on {len(processing_shards)} shards from node {constraints.source_node}") else: processing_shards = moveable_shards @@ -295,7 +234,7 @@ def generate_rebalancing_recommendations( logger.info(f"Shard not found: {i}") continue - if len(recommendations) >= max_recommendations: + if len(recommendations) >= constraints.max_recommendations: logger.info(f"Found {len(recommendations)} recommendations for shard: {shard.shard_id}") break @@ -306,7 +245,7 @@ def generate_rebalancing_recommendations( total_evaluated += 1 # Skip based on priority mode - if not prioritize_space: + if not constraints.prioritize_space: # Zone balancing mode: only move shards from overloaded 
zones if shard.zone not in overloaded_zones: continue @@ -316,13 +255,13 @@ def generate_rebalancing_recommendations( target_nodes = self._find_nodes_with_capacity_cached( required_space_gb=shard.size_gb, exclude_nodes={shard.node_name}, # Don't move to same node - min_free_space_gb=min_free_space_gb, - max_disk_usage_percent=max_disk_usage_percent, + min_free_space_gb=constraints.min_free_space, + max_disk_usage_percent=constraints.max_disk_usage, ) # Quick pre-filter to avoid expensive safety validations # Only check nodes in different zones (for zone balancing) - if not prioritize_space: + if not constraints.prioritize_space: target_nodes = [node for node in target_nodes if node.zone != shard.zone] # Limit to top 3 candidates to reduce validation overhead @@ -346,7 +285,7 @@ def generate_rebalancing_recommendations( ) # Check if this move would be safe - is_safe, safety_msg = self.validate_move_safety(temp_rec, max_disk_usage_percent) + is_safe, safety_msg = self.validate_move_safety(temp_rec, constraints.max_disk_usage) if is_safe: safe_target_nodes.append(candidate_node) @@ -354,7 +293,7 @@ def generate_rebalancing_recommendations( continue # No safe targets found, skip this shard target_node: NodeInfo - if prioritize_space: + if constraints.prioritize_space: # Space priority mode: choose node with most available space target_node = safe_target_nodes[0] # Already sorted by available space (desc) else: @@ -384,7 +323,7 @@ def generate_rebalancing_recommendations( continue # No suitable target found # Determine the reason for the move - if prioritize_space: + if constraints.prioritize_space: if shard.zone == target_node.zone: reason = f"Space optimization within {shard.zone}" else: diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 2ad88d15..065b89be 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -1,9 +1,10 @@ """ -Command line interface for XMover - CrateDB Shard Analyzer 
def recommend(
    ctx,
    table: Optional[str],
    node: Optional[str],
    min_size: float,
    max_size: float,
    zone_tolerance: float,
    min_free_space: float,
    max_moves: int,
    max_disk_usage: float,
    prioritize_space: bool,
    validate: bool,
    dry_run: bool,
    auto_execute: bool,
):
    """Generate shard movement recommendations for rebalancing.

    Thin CLI wrapper: the analysis, safety validation, and output formerly
    inlined here now live in Recommender. This function only packs the CLI
    options into a RecommendationConstraints value and delegates.
    NOTE(review): click passes these parameters by keyword, so the reordering
    of the signature relative to the CLI options should be safe -- confirm
    against the option decorators above.
    """
    recommender = Recommender(
        client=ctx.obj["client"],
        constraints=RecommendationConstraints(
            table_name=table,
            source_node=node,
            min_size=min_size,
            max_size=max_size,
            zone_tolerance=zone_tolerance,
            min_free_space=min_free_space,
            max_recommendations=max_moves,
            max_disk_usage=max_disk_usage,
            prioritize_space=prioritize_space,
        ),
    )
    # Execution-side switches (dry-run analysis vs. SQL generation vs. auto-execute)
    # are handled inside the recommender, not encoded in the constraints.
    recommender.start(auto_execute=auto_execute, validate=validate, dry_run=dry_run)
count is below threshold""" - - recovery_monitor = RecoveryMonitor(client, RecoveryOptions(include_transitioning=True)) - wait_time = 0 - - while True: - # Check active recoveries (including transitioning) - recoveries = recovery_monitor.get_cluster_recovery_status() - active_count = len([r for r in recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]) - status = f"{active_count}/{max_concurrent_recoveries}" - if active_count < max_concurrent_recoveries: - if wait_time > 0: - console.print(f" [green]βœ“ Recovery capacity available ({status} active)[/green]") - break - if wait_time == 0: - console.print(f" [yellow]⏳ Waiting for recovery capacity... ({status} active)[/yellow]") - elif wait_time % 30 == 0: # Update every 30 seconds - console.print(f" [yellow]⏳ Still waiting... ({status} active)[/yellow]") - - time.sleep(10) # Check every 10 seconds - wait_time += 10 - - -def _execute_recommendations_safely(client, recommendations, validate: bool): - """Execute recommendations with extensive safety measures""" - - # Filter to only safe recommendations - safe_recommendations = [] - if validate: - analyzer = ShardAnalyzer(client) - for rec in recommendations: - is_safe, safety_msg = analyzer.validate_move_safety(rec, max_disk_usage_percent=95.0) - if is_safe: - safe_recommendations.append(rec) - else: - safe_recommendations = recommendations - - if not safe_recommendations: - console.print("[yellow]⚠ No safe recommendations to execute[/yellow]") - return - - console.print("\n[bold red]🚨 AUTO-EXECUTION MODE 🚨[/bold red]") - console.print(f"About to execute {len(safe_recommendations)} shard moves automatically:") - console.print() - - # Show what will be executed - for i, rec in enumerate(safe_recommendations, 1): - table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name - console.print(f" {i}. 
{table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB) {rec.from_node} β†’ {rec.to_node}") - - console.print() - console.print("[bold yellow]⚠ SAFETY WARNINGS:[/bold yellow]") - console.print(" β€’ These commands will immediately start shard movements") - console.print(" β€’ Each move will temporarily impact cluster performance") - console.print(" β€’ Recovery time depends on shard size and network speed") - console.print(" β€’ You should monitor progress with: xmover monitor-recovery --watch") - console.print() - - # Double confirmation - try: - response1 = input("Type 'EXECUTE' to proceed with automatic execution: ").strip() - if response1 != "EXECUTE": - console.print("[yellow]❌ Execution cancelled[/yellow]") - return - - response2 = input(f"Confirm: Execute {len(safe_recommendations)} shard moves? (yes/no): ").strip().lower() - if response2 not in ["yes", "y"]: - console.print("[yellow]❌ Execution cancelled[/yellow]") - return - - except KeyboardInterrupt: - console.print("\n[yellow]❌ Execution cancelled by user[/yellow]") - return - - console.print(f"\nπŸš€ [bold green]Executing {len(safe_recommendations)} shard moves...[/bold green]") - console.print() - - successful_moves = 0 - failed_moves = 0 - - for i, rec in enumerate(safe_recommendations, 1): - table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name - sql_command = rec.to_sql() - - console.print( - f"[{i}/{len(safe_recommendations)}] Executing: {table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB)" - ) - console.print(f" {rec.from_node} β†’ {rec.to_node}") - - try: - # Execute the SQL command - result = client.execute_query(sql_command) - - if result.get("rowcount", 0) >= 0: # Success indicator for ALTER statements - console.print(" [green]βœ… SUCCESS[/green] - Move initiated") - successful_moves += 1 - - # Smart delay: check active recoveries before next move - if i < len(safe_recommendations): - _wait_for_recovery_capacity(client, 
max_concurrent_recoveries=5) - else: - console.print(f" [red]❌ FAILED[/red] - Unexpected result: {result}") - failed_moves += 1 - - except Exception as e: - console.print(f" [red]❌ FAILED[/red] - Error: {e}") - failed_moves += 1 - - # Ask whether to continue after a failure - if i < len(safe_recommendations): - try: - continue_response = ( - input(f" Continue with remaining {len(safe_recommendations) - i} moves? (yes/no): ") - .strip() - .lower() - ) - if continue_response not in ["yes", "y"]: - console.print("[yellow]⏹ Execution stopped by user[/yellow]") - break - except KeyboardInterrupt: - console.print("\n[yellow]⏹ Execution stopped by user[/yellow]") - break - - console.print() - - # Final summary - console.print("πŸ“Š [bold]Execution Summary:[/bold]") - console.print(f" Successful moves: [green]{successful_moves}[/green]") - console.print(f" Failed moves: [red]{failed_moves}[/red]") - console.print(f" Total attempted: {successful_moves + failed_moves}") - - if successful_moves > 0: - console.print() - console.print("[green]βœ… Shard moves initiated successfully![/green]") - console.print("[dim]πŸ’‘ Monitor progress with:[/dim]") - console.print("[dim] xmover monitor-recovery --watch[/dim]") - console.print("[dim]πŸ’‘ Check cluster status with:[/dim]") - console.print("[dim] xmover analyze[/dim]") - - if failed_moves > 0: - console.print() - console.print(f"[yellow]⚠ {failed_moves} moves failed - check cluster status and retry if needed[/yellow]") - - if __name__ == "__main__": main() diff --git a/cratedb_toolkit/admin/xmover/model.py b/cratedb_toolkit/admin/xmover/model.py index 4d271445..12286597 100644 --- a/cratedb_toolkit/admin/xmover/model.py +++ b/cratedb_toolkit/admin/xmover/model.py @@ -1,5 +1,6 @@ +import dataclasses from dataclasses import dataclass -from typing import Optional +from typing import Dict, Optional @dataclass @@ -99,3 +100,68 @@ def translog_size_gb(self) -> float: def translog_percentage(self) -> float: """Translog size as percentage 
of shard size""" return (self.translog_size_bytes / self.size_bytes * 100) if self.size_bytes > 0 else 0 + + +@dataclass +class MoveRecommendation: + """Recommendation for moving a shard""" + + table_name: str + schema_name: str + shard_id: int + from_node: str + to_node: str + from_zone: str + to_zone: str + shard_type: str + size_gb: float + reason: str + + def to_sql(self) -> str: + """Generate the SQL command for this move""" + return ( + f'ALTER TABLE "{self.schema_name}"."{self.table_name}" ' + f"REROUTE MOVE SHARD {self.shard_id} " + f"FROM '{self.from_node}' TO '{self.to_node}';" + ) + + @property + def safety_score(self) -> float: + """Calculate a safety score for this move (0-1, higher is safer)""" + score = 1.0 + + # Penalize if moving to same zone (not ideal for zone distribution) + if self.from_zone == self.to_zone: + score -= 0.3 + + # Bonus for zone balancing moves + if "rebalancing" in self.reason.lower(): + score += 0.2 + + # Ensure score stays in valid range + return max(0.0, min(1.0, score)) + + +@dataclass +class DistributionStats: + """Statistics about shard distribution""" + + total_shards: int + total_size_gb: float + zones: Dict[str, int] + nodes: Dict[str, int] + zone_balance_score: float # 0-100, higher is better + node_balance_score: float # 0-100, higher is better + + +@dataclasses.dataclass +class RecommendationConstraints: + min_size: float = 40.0 + max_size: float = 60.0 + table_name: Optional[str] = None + source_node: Optional[str] = None + zone_tolerance: float = 10.0 + min_free_space: float = 100.0 + max_recommendations: int = 10 + max_disk_usage: float = 90.0 + prioritize_space: bool = False diff --git a/cratedb_toolkit/admin/xmover/recommender.py b/cratedb_toolkit/admin/xmover/recommender.py new file mode 100644 index 00000000..7e780600 --- /dev/null +++ b/cratedb_toolkit/admin/xmover/recommender.py @@ -0,0 +1,366 @@ +import time + +from rich import box +from rich.console import Console +from rich.panel import Panel +from 
rich.table import Table + +from .analyzer import ShardAnalyzer +from .database import CrateDBClient +from .model import RecommendationConstraints +from .recovery import RecoveryMonitor, RecoveryOptions +from .util import format_size + +console = Console() + + +class Recommender: + def __init__(self, client: CrateDBClient, constraints: RecommendationConstraints): + self.client = client + self.constraints = constraints + self.analyzer = ShardAnalyzer(self.client) + + def start( + self, + auto_execute: bool, + validate: bool, + dry_run: bool, + ): + # Safety check for auto-execute + if auto_execute and dry_run: + console.print("[red]❌ Error: --auto-execute requires --execute flag[/red]") + console.print("[dim]Use: --execute --auto-execute[/dim]") + return + + mode_text = "DRY RUN - Analysis Only" if dry_run else "EXECUTION MODE" + console.print( + Panel.fit( + f"[bold blue]Generating Rebalancing Recommendations[/bold blue] - " + f"[bold {'green' if dry_run else 'red'}]{mode_text}[/bold {'green' if dry_run else 'red'}]" + ) + ) + console.print("[dim]Note: Only analyzing healthy shards (STARTED + 100% recovered) for safe operations[/dim]") + console.print("[dim]Zone conflict detection: Prevents moves that would violate CrateDB's zone awareness[/dim]") + if self.constraints.prioritize_space: + console.print("[dim]Mode: Prioritizing available space over zone balancing[/dim]") + else: + console.print("[dim]Mode: Prioritizing zone balancing over available space[/dim]") + + if self.constraints.source_node: + console.print(f"[dim]Filtering: Only showing moves from source node '{self.constraints.source_node}'[/dim]") + + console.print( + f"[dim]Safety thresholds: Max disk usage {self.constraints.max_disk_usage}%, " + f"Min free space {self.constraints.min_free_space}GB[/dim]" + ) + + if dry_run: + console.print("[green]Running in DRY RUN mode - no SQL commands will be generated[/green]") + else: + console.print("[red]EXECUTION MODE - SQL commands will be generated for actual 
moves[/red]") + console.print() + + recommendations = self.analyzer.generate_rebalancing_recommendations(constraints=self.constraints) + + if not recommendations: + if self.constraints.source_node: + console.print( + f"[yellow]No safe recommendations found for node '{self.constraints.source_node}'[/yellow]" + ) + console.print("[dim]This could be due to:[/dim]") + console.print("[dim] β€’ Zone conflicts preventing safe moves[/dim]") + console.print( + f"[dim] β€’ Target nodes exceeding {self.constraints.max_disk_usage}% disk usage threshold[/dim]" + ) + console.print( + f"[dim] β€’ Insufficient free space on target nodes (need {self.constraints.min_free_space}GB)[/dim]" + ) + console.print( + f"[dim] β€’ No shards in size range {self.constraints.min_size}-{self.constraints.max_size}GB[/dim]" + ) + console.print("[dim]Suggestions:[/dim]") + console.print("[dim] β€’ Try: --max-disk-usage 95 (allow higher disk usage)[/dim]") + console.print("[dim] β€’ Try: --min-free-space 50 (reduce space requirements)[/dim]") + console.print("[dim] β€’ Try: different size ranges or remove --node filter[/dim]") + else: + console.print("[green]No rebalancing recommendations needed. 
Cluster appears well balanced![/green]") + return + + # Show recommendations table + rec_table = Table(title=f"Rebalancing Recommendations ({len(recommendations)} moves)", box=box.ROUNDED) + rec_table.add_column("Table", style="cyan") + rec_table.add_column("Shard", justify="right", style="magenta") + rec_table.add_column("Type", style="blue") + rec_table.add_column("From Node", style="red") + rec_table.add_column("To Node", style="green") + rec_table.add_column("Target Free Space", justify="right", style="cyan") + rec_table.add_column("Zone Change", style="yellow") + rec_table.add_column("Size", justify="right", style="white") + rec_table.add_column("Reason", style="dim") + if validate: + rec_table.add_column("Safety Check", style="bold") + + # Create a mapping of node names to available space for display + node_space_map = {node.name: node.available_space_gb for node in self.analyzer.nodes} + + for rec in recommendations: + zone_change = f"{rec.from_zone} β†’ {rec.to_zone}" if rec.from_zone != rec.to_zone else rec.from_zone + target_free_space = node_space_map.get(rec.to_node, 0) + + row = [ + f"{rec.schema_name}.{rec.table_name}", + str(rec.shard_id), + rec.shard_type, + rec.from_node, + rec.to_node, + format_size(target_free_space), + zone_change, + format_size(rec.size_gb), + rec.reason, + ] + + if validate: + is_safe, safety_msg = self.analyzer.validate_move_safety( + rec, max_disk_usage_percent=self.constraints.max_disk_usage + ) + safety_status = "[green]βœ“ SAFE[/green]" if is_safe else f"[red]βœ— {safety_msg}[/red]" + row.append(safety_status) + + rec_table.add_row(*row) + + console.print(rec_table) + console.print() + + # Generate SQL commands or show dry-run analysis + if dry_run: + console.print(Panel.fit("[bold yellow]Dry Run Analysis - No Commands Generated[/bold yellow]")) + console.print("[dim]# This is a dry run - showing what would be recommended[/dim]") + console.print("[dim]# Use --execute flag to generate actual SQL commands[/dim]") + 
console.print() + + safe_moves = 0 + zone_conflicts = 0 + space_issues = 0 + + for i, rec in enumerate(recommendations, 1): + if validate: + is_safe, safety_msg = self.analyzer.validate_move_safety( + rec, max_disk_usage_percent=self.constraints.max_disk_usage + ) + if not is_safe: + if "zone conflict" in safety_msg.lower(): + zone_conflicts += 1 + console.print(f"[yellow]⚠ Move {i}: WOULD BE SKIPPED - {safety_msg}[/yellow]") + elif "space" in safety_msg.lower(): + space_issues += 1 + console.print(f"[yellow]⚠ Move {i}: WOULD BE SKIPPED - {safety_msg}[/yellow]") + else: + console.print(f"[yellow]⚠ Move {i}: WOULD BE SKIPPED - {safety_msg}[/yellow]") + continue + safe_moves += 1 + + console.print(f"[green]βœ“ Move {i}: WOULD EXECUTE - {rec.reason}[/green]") + console.print(f"[dim] Target SQL: {rec.to_sql()}[/dim]") + + console.print() + console.print("[bold]Dry Run Summary:[/bold]") + console.print(f" β€’ Safe moves that would execute: [green]{safe_moves}[/green]") + console.print(f" β€’ Zone conflicts prevented: [yellow]{zone_conflicts}[/yellow]") + console.print(f" β€’ Space-related issues: [yellow]{space_issues}[/yellow]") + if safe_moves > 0: + console.print( + f"\n[green]βœ“ Ready to execute {safe_moves} safe moves. " + f"Use --execute to generate SQL commands.[/green]" + ) + else: + console.print( + "\n[yellow]⚠ No safe moves identified. 
Review cluster balance or adjust parameters.[/yellow]" + ) + else: + console.print(Panel.fit("[bold green]Generated SQL Commands[/bold green]")) + console.print("[dim]# Copy and paste these commands to execute the moves[/dim]") + console.print("[dim]# ALWAYS test in a non-production environment first![/dim]") + console.print("[dim]# These commands only operate on healthy shards (STARTED + fully recovered)[/dim]") + console.print("[dim]# Commands use quoted identifiers for schema and table names[/dim]") + console.print() + + safe_moves = 0 + zone_conflicts = 0 + for i, rec in enumerate(recommendations, 1): + if validate: + is_safe, safety_msg = self.analyzer.validate_move_safety( + rec, max_disk_usage_percent=self.constraints.max_disk_usage + ) + if not is_safe: + if "Zone conflict" in safety_msg: + zone_conflicts += 1 + console.print(f"-- Move {i}: SKIPPED - {safety_msg}") + console.print( + "-- Tip: Try moving to a different zone or check existing shard distribution" + ) + else: + console.print(f"-- Move {i}: SKIPPED - {safety_msg}") + continue + safe_moves += 1 + + console.print(f"-- Move {i}: {rec.reason}") + console.print(f"{rec.to_sql()}") + console.print() + + # Auto-execution if requested + if auto_execute: + self._execute_recommendations_safely(recommendations, validate) + + if validate and safe_moves < len(recommendations): + if zone_conflicts > 0: + console.print(f"[yellow]Warning: {zone_conflicts} moves skipped due to zone conflicts[/yellow]") + console.print( + "[yellow]Tip: Use 'find-candidates' to see current shard distribution across zones[/yellow]" + ) + console.print( + f"[yellow]Warning: Only {safe_moves} of {len(recommendations)} moves passed safety validation[/yellow]" + ) + + def _execute_recommendations_safely(self, recommendations, validate: bool): + """Execute recommendations with extensive safety measures""" + + # Filter to only safe recommendations + safe_recommendations = [] + if validate: + for rec in recommendations: + is_safe, 
safety_msg = self.analyzer.validate_move_safety(rec, max_disk_usage_percent=95.0) + if is_safe: + safe_recommendations.append(rec) + else: + safe_recommendations = recommendations + + if not safe_recommendations: + console.print("[yellow]⚠ No safe recommendations to execute[/yellow]") + return + + console.print("\n[bold red]🚨 AUTO-EXECUTION MODE 🚨[/bold red]") + console.print(f"About to execute {len(safe_recommendations)} shard moves automatically:") + console.print() + + # Show what will be executed + for i, rec in enumerate(safe_recommendations, 1): + table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name + console.print( + f" {i}. {table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB) {rec.from_node} β†’ {rec.to_node}" + ) + + console.print() + console.print("[bold yellow]⚠ SAFETY WARNINGS:[/bold yellow]") + console.print(" β€’ These commands will immediately start shard movements") + console.print(" β€’ Each move will temporarily impact cluster performance") + console.print(" β€’ Recovery time depends on shard size and network speed") + console.print(" β€’ You should monitor progress with: xmover monitor-recovery --watch") + console.print() + + # Double confirmation + try: + response1 = input("Type 'EXECUTE' to proceed with automatic execution: ").strip() + if response1 != "EXECUTE": + console.print("[yellow]❌ Execution cancelled[/yellow]") + return + + response2 = input(f"Confirm: Execute {len(safe_recommendations)} shard moves? 
(yes/no): ").strip().lower() + if response2 not in ["yes", "y"]: + console.print("[yellow]❌ Execution cancelled[/yellow]") + return + + except KeyboardInterrupt: + console.print("\n[yellow]❌ Execution cancelled by user[/yellow]") + return + + console.print(f"\nπŸš€ [bold green]Executing {len(safe_recommendations)} shard moves...[/bold green]") + console.print() + + successful_moves = 0 + failed_moves = 0 + + for i, rec in enumerate(safe_recommendations, 1): + table_display = f"{rec.schema_name}.{rec.table_name}" if rec.schema_name != "doc" else rec.table_name + sql_command = rec.to_sql() + + console.print( + f"[{i}/{len(safe_recommendations)}] Executing: {table_display} S{rec.shard_id} ({rec.size_gb:.1f}GB)" + ) + console.print(f" {rec.from_node} β†’ {rec.to_node}") + + try: + # Execute the SQL command + result = self.client.execute_query(sql_command) + + if result.get("rowcount", 0) >= 0: # Success indicator for ALTER statements + console.print(" [green]βœ… SUCCESS[/green] - Move initiated") + successful_moves += 1 + + # Smart delay: check active recoveries before next move + if i < len(safe_recommendations): + self._wait_for_recovery_capacity(max_concurrent_recoveries=5) + else: + console.print(f" [red]❌ FAILED[/red] - Unexpected result: {result}") + failed_moves += 1 + + except Exception as e: + console.print(f" [red]❌ FAILED[/red] - Error: {e}") + failed_moves += 1 + + # Ask whether to continue after a failure + if i < len(safe_recommendations): + try: + continue_response = ( + input(f" Continue with remaining {len(safe_recommendations) - i} moves? 
(yes/no): ") + .strip() + .lower() + ) + if continue_response not in ["yes", "y"]: + console.print("[yellow]⏹ Execution stopped by user[/yellow]") + break + except KeyboardInterrupt: + console.print("\n[yellow]⏹ Execution stopped by user[/yellow]") + break + + console.print() + + # Final summary + console.print("πŸ“Š [bold]Execution Summary:[/bold]") + console.print(f" Successful moves: [green]{successful_moves}[/green]") + console.print(f" Failed moves: [red]{failed_moves}[/red]") + console.print(f" Total attempted: {successful_moves + failed_moves}") + + if successful_moves > 0: + console.print() + console.print("[green]βœ… Shard moves initiated successfully![/green]") + console.print("[dim]πŸ’‘ Monitor progress with:[/dim]") + console.print("[dim] xmover monitor-recovery --watch[/dim]") + console.print("[dim]πŸ’‘ Check cluster status with:[/dim]") + console.print("[dim] xmover analyze[/dim]") + + if failed_moves > 0: + console.print() + console.print(f"[yellow]⚠ {failed_moves} moves failed - check cluster status and retry if needed[/yellow]") + + def _wait_for_recovery_capacity(self, max_concurrent_recoveries: int = 5): + """Wait until active recovery count is below threshold""" + + recovery_monitor = RecoveryMonitor(self.client, RecoveryOptions(include_transitioning=True)) + wait_time = 0 + + while True: + # Check active recoveries (including transitioning) + recoveries = recovery_monitor.get_cluster_recovery_status() + active_count = len([r for r in recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]) + status = f"{active_count}/{max_concurrent_recoveries}" + if active_count < max_concurrent_recoveries: + if wait_time > 0: + console.print(f" [green]βœ“ Recovery capacity available ({status} active)[/green]") + break + if wait_time == 0: + console.print(f" [yellow]⏳ Waiting for recovery capacity... ({status} active)[/yellow]") + elif wait_time % 30 == 0: # Update every 30 seconds + console.print(f" [yellow]⏳ Still waiting... 
({status} active)[/yellow]") + + time.sleep(10) # Check every 10 seconds + wait_time += 10 From 2dc5b913614ae75d934f8db9b5bb331a5c26d116 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 20:16:10 +0200 Subject: [PATCH 08/18] Admin/XMover: Refactor -- "reporter" --- cratedb_toolkit/admin/xmover/analyzer.py | 22 +- cratedb_toolkit/admin/xmover/cli.py | 323 +++------------------- cratedb_toolkit/admin/xmover/model.py | 21 +- cratedb_toolkit/admin/xmover/reporter.py | 325 +++++++++++++++++++++++ 4 files changed, 387 insertions(+), 304 deletions(-) create mode 100644 cratedb_toolkit/admin/xmover/reporter.py diff --git a/cratedb_toolkit/admin/xmover/analyzer.py b/cratedb_toolkit/admin/xmover/analyzer.py index 311b1a33..98af6a21 100644 --- a/cratedb_toolkit/admin/xmover/analyzer.py +++ b/cratedb_toolkit/admin/xmover/analyzer.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union from .database import CrateDBClient -from .model import DistributionStats, MoveRecommendation, NodeInfo, RecommendationConstraints, ShardInfo +from .model import DistributionStats, NodeInfo, RecommendationConstraints, ShardInfo, ShardMoveRecommendation logger = logging.getLogger(__name__) @@ -174,7 +174,9 @@ def find_nodes_with_capacity( available_nodes.sort(key=lambda n: n.available_space_gb, reverse=True) return available_nodes - def generate_rebalancing_recommendations(self, constraints: RecommendationConstraints) -> List[MoveRecommendation]: + def generate_rebalancing_recommendations( + self, constraints: RecommendationConstraints + ) -> List[ShardMoveRecommendation]: """Generate recommendations for rebalancing shards Args: @@ -183,7 +185,7 @@ def generate_rebalancing_recommendations(self, constraints: RecommendationConstr source_node: If specified, only generate recommendations for shards on this node max_disk_usage_percent: Maximum disk usage percentage for target nodes """ - recommendations: List[MoveRecommendation] = [] + recommendations: 
List[ShardMoveRecommendation] = [] # Get moveable shards (only healthy ones for actual operations) moveable_shards = self.find_moveable_shards(constraints.min_size, constraints.max_size, constraints.table_name) @@ -271,7 +273,7 @@ def generate_rebalancing_recommendations(self, constraints: RecommendationConstr safe_target_nodes = [] for candidate_node in target_nodes: # Create a temporary recommendation to test safety - temp_rec = MoveRecommendation( + temp_rec = ShardMoveRecommendation( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, @@ -333,7 +335,7 @@ def generate_rebalancing_recommendations(self, constraints: RecommendationConstr if shard.zone == target_node.zone: reason = f"Node balancing within {shard.zone}" - recommendation = MoveRecommendation( + recommendation = ShardMoveRecommendation( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, @@ -355,7 +357,7 @@ def generate_rebalancing_recommendations(self, constraints: RecommendationConstr return recommendations def validate_move_safety( - self, recommendation: MoveRecommendation, max_disk_usage_percent: float = 90.0 + self, recommendation: ShardMoveRecommendation, max_disk_usage_percent: float = 90.0 ) -> Tuple[bool, str]: """Validate that a move recommendation is safe to execute""" # Find target node (with caching) @@ -401,7 +403,7 @@ def _get_node_cached(self, node_name: str): self._node_lookup_cache[node_name] = target_node return target_node - def _check_zone_conflict_cached(self, recommendation: MoveRecommendation) -> Optional[str]: + def _check_zone_conflict_cached(self, recommendation: ShardMoveRecommendation) -> Optional[str]: """Check zone conflicts with caching""" # Create cache key: table, shard, target zone target_zone = self._get_node_zone(recommendation.to_node) @@ -459,7 +461,7 @@ def _find_nodes_with_capacity_cached( self._target_nodes_cache[cache_key] = result return result - def _check_zone_conflict(self, recommendation: 
MoveRecommendation) -> Optional[str]: + def _check_zone_conflict(self, recommendation: ShardMoveRecommendation) -> Optional[str]: """Check if moving this shard would create a zone conflict Performs comprehensive zone safety analysis: @@ -745,7 +747,7 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100. safe_targets = [] for target in potential_targets: # Create a temporary recommendation to test zone safety - temp_rec = MoveRecommendation( + temp_rec = ShardMoveRecommendation( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, @@ -770,7 +772,7 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100. # Choose the target with most available space best_target = safe_targets[0] move_plan.append( - MoveRecommendation( + ShardMoveRecommendation( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 065b89be..76143895 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -13,13 +13,18 @@ from rich.panel import Panel from rich.table import Table -from cratedb_toolkit.admin.xmover.model import MoveRecommendation, RecommendationConstraints, ShardInfo +from cratedb_toolkit.admin.xmover.model import ( + RecommendationConstraints, + ShardInfo, + ShardMoveRequest, + SizeCriteria, +) from cratedb_toolkit.admin.xmover.recommender import Recommender +from cratedb_toolkit.admin.xmover.reporter import ShardReporter from .analyzer import ShardAnalyzer from .database import CrateDBClient from .recovery import RecoveryMonitor, RecoveryOptions -from .util import format_percentage, format_size console = Console() @@ -55,185 +60,31 @@ def analyze(ctx, table: Optional[str]): """Analyze current shard distribution across nodes and zones""" client = ctx.obj["client"] analyzer = ShardAnalyzer(client) - - console.print(Panel.fit("[bold 
blue]CrateDB Cluster Analysis[/bold blue]")) - - # Get cluster overview (includes all shards for complete analysis) - overview = analyzer.get_cluster_overview() - - # Cluster summary table - summary_table = Table(title="Cluster Summary", box=box.ROUNDED) - summary_table.add_column("Metric", style="cyan") - summary_table.add_column("Value", style="magenta") - - summary_table.add_row("Nodes", str(overview["nodes"])) - summary_table.add_row("Availability Zones", str(overview["zones"])) - summary_table.add_row("Total Shards", str(overview["total_shards"])) - summary_table.add_row("Primary Shards", str(overview["primary_shards"])) - summary_table.add_row("Replica Shards", str(overview["replica_shards"])) - summary_table.add_row("Total Size", format_size(overview["total_size_gb"])) - - console.print(summary_table) - console.print() - - # Disk watermarks table - if overview.get("watermarks"): - watermarks_table = Table(title="Disk Allocation Watermarks", box=box.ROUNDED) - watermarks_table.add_column("Setting", style="cyan") - watermarks_table.add_column("Value", style="magenta") - - watermarks = overview["watermarks"] - watermarks_table.add_row("Low Watermark", str(watermarks.get("low", "Not set"))) - watermarks_table.add_row("High Watermark", str(watermarks.get("high", "Not set"))) - watermarks_table.add_row("Flood Stage", str(watermarks.get("flood_stage", "Not set"))) - watermarks_table.add_row( - "Enable for Single Node", str(watermarks.get("enable_for_single_data_node", "Not set")) - ) - - console.print(watermarks_table) - console.print() - - # Zone distribution table - zone_table = Table(title="Zone Distribution", box=box.ROUNDED) - zone_table.add_column("Zone", style="cyan") - zone_table.add_column("Shards", justify="right", style="magenta") - zone_table.add_column("Percentage", justify="right", style="green") - - total_shards = overview["total_shards"] - for zone, count in overview["zone_distribution"].items(): - percentage = (count / total_shards * 100) if 
total_shards > 0 else 0 - zone_table.add_row(zone, str(count), f"{percentage:.1f}%") - - console.print(zone_table) - console.print() - - # Node health table - node_table = Table(title="Node Health", box=box.ROUNDED) - node_table.add_column("Node", style="cyan") - node_table.add_column("Zone", style="blue") - node_table.add_column("Shards", justify="right", style="magenta") - node_table.add_column("Size", justify="right", style="green") - node_table.add_column("Disk Usage", justify="right") - node_table.add_column("Available Space", justify="right", style="green") - node_table.add_column("Until Low WM", justify="right", style="yellow") - node_table.add_column("Until High WM", justify="right", style="red") - - for node_info in overview["node_health"]: - # Format watermark remaining capacity - low_wm_remaining = ( - format_size(node_info["remaining_to_low_watermark_gb"]) - if node_info["remaining_to_low_watermark_gb"] > 0 - else "[red]Exceeded[/red]" - ) - high_wm_remaining = ( - format_size(node_info["remaining_to_high_watermark_gb"]) - if node_info["remaining_to_high_watermark_gb"] > 0 - else "[red]Exceeded[/red]" - ) - - node_table.add_row( - node_info["name"], - node_info["zone"], - str(node_info["shards"]), - format_size(node_info["size_gb"]), - format_percentage(node_info["disk_usage_percent"]), - format_size(node_info["available_space_gb"]), - low_wm_remaining, - high_wm_remaining, - ) - - console.print(node_table) - - # Table-specific analysis if requested - if table: - console.print() - console.print(Panel.fit(f"[bold blue]Analysis for table: {table}[/bold blue]")) - - stats = analyzer.analyze_distribution(table) - - table_summary = Table(title=f"Table {table} Distribution", box=box.ROUNDED) - table_summary.add_column("Metric", style="cyan") - table_summary.add_column("Value", style="magenta") - - table_summary.add_row("Total Shards", str(stats.total_shards)) - table_summary.add_row("Total Size", format_size(stats.total_size_gb)) - table_summary.add_row("Zone 
Balance Score", f"{stats.zone_balance_score:.1f}/100") - table_summary.add_row("Node Balance Score", f"{stats.node_balance_score:.1f}/100") - - console.print(table_summary) + reporter = ShardReporter(analyzer) + reporter.distribution(table=table) @main.command() -@click.option("--table", "-t", help="Find candidates for specific table only") @click.option("--min-size", default=40.0, help="Minimum shard size in GB (default: 40)") @click.option("--max-size", default=60.0, help="Maximum shard size in GB (default: 60)") @click.option("--limit", default=20, help="Maximum number of candidates to show (default: 20)") +@click.option("--table", "-t", help="Find candidates for specific table only") @click.option("--node", help="Only show candidates from this specific source node (e.g., data-hot-4)") @click.pass_context -def find_candidates(ctx, table: Optional[str], min_size: float, max_size: float, limit: int, node: Optional[str]): - """Find shard candidates for movement based on size criteria - - Results are sorted by nodes with least available space first, - then by shard size (smallest first) for easier moves. 
- """ +def find_candidates(ctx, min_size: float, max_size: float, limit: int, table: Optional[str], node: Optional[str]): + """Find shard candidates for movement based on size criteria""" client = ctx.obj["client"] analyzer = ShardAnalyzer(client) - - console.print(Panel.fit(f"[bold blue]Finding Moveable Shards ({min_size}-{max_size}GB)[/bold blue]")) - - if node: - console.print(f"[dim]Filtering: Only showing candidates from source node '{node}'[/dim]") - - # Find moveable candidates (only healthy shards suitable for operations) - candidates = analyzer.find_moveable_shards(min_size, max_size, table) - - # Filter by node if specified - if node: - candidates = [c for c in candidates if c.node_name == node] - - if not candidates: - if node: - console.print(f"[yellow]No moveable shards found on node '{node}' in the specified size range.[/yellow]") - console.print("[dim]Tip: Try different size ranges or remove --node filter to see all candidates[/dim]") - else: - console.print("[yellow]No moveable shards found in the specified size range.[/yellow]") - return - - # Show limited results - shown_candidates = candidates[:limit] - - candidates_table = Table( - title=f"Moveable Shard Candidates (showing {len(shown_candidates)} of {len(candidates)})", box=box.ROUNDED + reporter = ShardReporter(analyzer) + reporter.movement_candidates( + criteria=SizeCriteria( + min_size=min_size, + max_size=max_size, + table_name=table, + source_node=node, + ), + limit=limit, ) - candidates_table.add_column("Table", style="cyan") - candidates_table.add_column("Shard ID", justify="right", style="magenta") - candidates_table.add_column("Type", style="blue") - candidates_table.add_column("Node", style="green") - candidates_table.add_column("Zone", style="yellow") - candidates_table.add_column("Size", justify="right", style="red") - candidates_table.add_column("Node Free Space", justify="right", style="white") - candidates_table.add_column("Documents", justify="right", style="dim") - - # Create a 
mapping of node names to available space for display - node_space_map = {node.name: node.available_space_gb for node in analyzer.nodes} - - for shard in shown_candidates: - node_free_space = node_space_map.get(shard.node_name, 0) - candidates_table.add_row( - f"{shard.schema_name}.{shard.table_name}", - str(shard.shard_id), - shard.shard_type, - shard.node_name, - shard.zone, - format_size(shard.size_gb), - format_size(node_free_space), - f"{shard.num_docs:,}", - ) - - console.print(candidates_table) - - if len(candidates) > limit: - console.print(f"\n[dim]... and {len(candidates) - limit} more candidates[/dim]") @main.command() @@ -499,130 +350,16 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node """ client = ctx.obj["client"] analyzer = ShardAnalyzer(client) - - # Parse schema and table - if "." not in schema_table: - console.print("[red]Error: Schema and table must be in format 'schema.table'[/red]") - return - - schema_name, table_name = schema_table.split(".", 1) - - console.print(Panel.fit("[bold blue]Validating Shard Move[/bold blue]")) - console.print(f"[dim]Move: {schema_name}.{table_name}[{shard_id}] from {from_node} to {to_node}[/dim]") - console.print() - - # Find the nodes - from_node_info = None - to_node_info = None - for node in analyzer.nodes: - if node.name == from_node: - from_node_info = node - if node.name == to_node: - to_node_info = node - - if not from_node_info: - console.print(f"[red]βœ— Source node '{from_node}' not found in cluster[/red]") - return - - if not to_node_info: - console.print(f"[red]βœ— Target node '{to_node}' not found in cluster[/red]") - return - - # Find the specific shard - target_shard = None - for shard in analyzer.shards: - if ( - shard.schema_name == schema_name - and shard.table_name == table_name - and shard.shard_id == shard_id - and shard.node_name == from_node - ): - target_shard = shard - break - - if not target_shard: - console.print(f"[red]βœ— Shard {shard_id} not found on 
node {from_node}[/red]") - console.print("[dim]Use 'xmover find-candidates' to see available shards[/dim]") - return - - # Create a move recommendation for validation - recommendation = MoveRecommendation( - table_name=table_name, - schema_name=schema_name, - shard_id=shard_id, - from_node=from_node, - to_node=to_node, - from_zone=from_node_info.zone, - to_zone=to_node_info.zone, - shard_type=target_shard.shard_type, - size_gb=target_shard.size_gb, - reason="Manual validation", - ) - - # Display shard details - details_table = Table(title="Shard Details", box=box.ROUNDED) - details_table.add_column("Property", style="cyan") - details_table.add_column("Value", style="magenta") - - details_table.add_row("Table", f"{schema_name}.{table_name}") - details_table.add_row("Shard ID", str(shard_id)) - details_table.add_row("Type", target_shard.shard_type) - details_table.add_row("Size", format_size(target_shard.size_gb)) - details_table.add_row("Documents", f"{target_shard.num_docs:,}") - details_table.add_row("State", target_shard.state) - details_table.add_row("Routing State", target_shard.routing_state) - details_table.add_row("From Node", f"{from_node} ({from_node_info.zone})") - details_table.add_row("To Node", f"{to_node} ({to_node_info.zone})") - details_table.add_row("Zone Change", "Yes" if from_node_info.zone != to_node_info.zone else "No") - - console.print(details_table) - console.print() - - # Perform comprehensive validation - is_safe, safety_msg = analyzer.validate_move_safety(recommendation, max_disk_usage_percent=max_disk_usage) - - if is_safe: - console.print("[green]βœ“ VALIDATION PASSED - Move appears safe[/green]") - console.print(f"[green]βœ“ {safety_msg}[/green]") - console.print() - - # Show the SQL command - console.print(Panel.fit("[bold green]Ready to Execute[/bold green]")) - console.print("[dim]# Copy and paste this command to execute the move[/dim]") - console.print() - console.print(f"{recommendation.to_sql()}") - console.print() - 
console.print("[dim]# Monitor shard health after execution[/dim]") - console.print( - "[dim]# Check with: SELECT * FROM sys.shards WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]" + reporter = ShardReporter(analyzer) + reporter.validate_move( + request=ShardMoveRequest( + schema_table=schema_table, + shard_id=shard_id, + from_node=from_node, + to_node=to_node, + max_disk_usage=max_disk_usage, ) - else: - console.print("[red]βœ— VALIDATION FAILED - Move not safe[/red]") - console.print(f"[red]βœ— {safety_msg}[/red]") - console.print() - - # Provide troubleshooting guidance - if "zone conflict" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting Zone Conflicts:[/yellow]") - console.print(" β€’ Check current shard distribution: xmover zone-analysis --show-shards") - console.print(" β€’ Try moving to a different zone") - console.print(" β€’ Verify cluster has proper zone-awareness configuration") - elif "node conflict" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting Node Conflicts:[/yellow]") - console.print(" β€’ The target node already has a copy of this shard") - console.print(" β€’ Choose a different target node") - console.print(" β€’ Check shard distribution: xmover analyze") - elif "space" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting Space Issues:[/yellow]") - console.print(" β€’ Free up space on the target node") - console.print(" β€’ Choose a node with more available capacity") - console.print(" β€’ Check node capacity: xmover analyze") - elif "usage" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting High Disk Usage:[/yellow]") - console.print(" β€’ Wait for target node disk usage to decrease") - console.print(" β€’ Choose a node with lower disk usage") - console.print(" β€’ Check cluster health: xmover analyze") - console.print(" β€’ Consider using --max-disk-usage option for urgent moves") + ) @main.command() diff --git a/cratedb_toolkit/admin/xmover/model.py 
b/cratedb_toolkit/admin/xmover/model.py index 12286597..056d834c 100644 --- a/cratedb_toolkit/admin/xmover/model.py +++ b/cratedb_toolkit/admin/xmover/model.py @@ -103,7 +103,18 @@ def translog_percentage(self) -> float: @dataclass -class MoveRecommendation: +class ShardMoveRequest: + """Request for moving a shard""" + + schema_table: str + shard_id: int + from_node: str + to_node: str + max_disk_usage: float + + +@dataclass +class ShardMoveRecommendation: """Recommendation for moving a shard""" table_name: str @@ -154,6 +165,14 @@ class DistributionStats: node_balance_score: float # 0-100, higher is better +@dataclasses.dataclass +class SizeCriteria: + min_size: float = 40.0 + max_size: float = 60.0 + table_name: Optional[str] = None + source_node: Optional[str] = None + + @dataclasses.dataclass class RecommendationConstraints: min_size: float = 40.0 diff --git a/cratedb_toolkit/admin/xmover/reporter.py b/cratedb_toolkit/admin/xmover/reporter.py new file mode 100644 index 00000000..6912bc54 --- /dev/null +++ b/cratedb_toolkit/admin/xmover/reporter.py @@ -0,0 +1,325 @@ +from typing import Any, Dict + +from rich import box +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + +from cratedb_toolkit.admin.xmover.analyzer import ShardAnalyzer +from cratedb_toolkit.admin.xmover.model import ShardMoveRecommendation, ShardMoveRequest, SizeCriteria +from cratedb_toolkit.admin.xmover.util import format_percentage, format_size + +console = Console() + + +class ShardReporter: + def __init__(self, analyzer: ShardAnalyzer): + self.analyzer = analyzer + + def distribution(self, table: str = None): + """Analyze current shard distribution across nodes and zones""" + console.print(Panel.fit("[bold blue]CrateDB Cluster Analysis[/bold blue]")) + + # Get cluster overview (includes all shards for complete analysis) + overview: Dict[str, Any] = self.analyzer.get_cluster_overview() + + # Cluster summary table + summary_table = 
Table(title="Cluster Summary", box=box.ROUNDED) + summary_table.add_column("Metric", style="cyan") + summary_table.add_column("Value", style="magenta") + + summary_table.add_row("Nodes", str(overview["nodes"])) + summary_table.add_row("Availability Zones", str(overview["zones"])) + summary_table.add_row("Total Shards", str(overview["total_shards"])) + summary_table.add_row("Primary Shards", str(overview["primary_shards"])) + summary_table.add_row("Replica Shards", str(overview["replica_shards"])) + summary_table.add_row("Total Size", format_size(overview["total_size_gb"])) + + console.print(summary_table) + console.print() + + # Disk watermarks table + if overview.get("watermarks"): + watermarks_table = Table(title="Disk Allocation Watermarks", box=box.ROUNDED) + watermarks_table.add_column("Setting", style="cyan") + watermarks_table.add_column("Value", style="magenta") + + watermarks = overview["watermarks"] + watermarks_table.add_row("Low Watermark", str(watermarks.get("low", "Not set"))) + watermarks_table.add_row("High Watermark", str(watermarks.get("high", "Not set"))) + watermarks_table.add_row("Flood Stage", str(watermarks.get("flood_stage", "Not set"))) + watermarks_table.add_row( + "Enable for Single Node", str(watermarks.get("enable_for_single_data_node", "Not set")) + ) + + console.print(watermarks_table) + console.print() + + # Zone distribution table + zone_table = Table(title="Zone Distribution", box=box.ROUNDED) + zone_table.add_column("Zone", style="cyan") + zone_table.add_column("Shards", justify="right", style="magenta") + zone_table.add_column("Percentage", justify="right", style="green") + + total_shards = overview["total_shards"] + for zone, count in overview["zone_distribution"].items(): + percentage = (count / total_shards * 100) if total_shards > 0 else 0 + zone_table.add_row(zone, str(count), f"{percentage:.1f}%") + + console.print(zone_table) + console.print() + + # Node health table + node_table = Table(title="Node Health", 
box=box.ROUNDED) + node_table.add_column("Node", style="cyan") + node_table.add_column("Zone", style="blue") + node_table.add_column("Shards", justify="right", style="magenta") + node_table.add_column("Size", justify="right", style="green") + node_table.add_column("Disk Usage", justify="right") + node_table.add_column("Available Space", justify="right", style="green") + node_table.add_column("Until Low WM", justify="right", style="yellow") + node_table.add_column("Until High WM", justify="right", style="red") + + for node_info in overview["node_health"]: + # Format watermark remaining capacity + low_wm_remaining = ( + format_size(node_info["remaining_to_low_watermark_gb"]) + if node_info["remaining_to_low_watermark_gb"] > 0 + else "[red]Exceeded[/red]" + ) + high_wm_remaining = ( + format_size(node_info["remaining_to_high_watermark_gb"]) + if node_info["remaining_to_high_watermark_gb"] > 0 + else "[red]Exceeded[/red]" + ) + + node_table.add_row( + node_info["name"], + node_info["zone"], + str(node_info["shards"]), + format_size(node_info["size_gb"]), + format_percentage(node_info["disk_usage_percent"]), + format_size(node_info["available_space_gb"]), + low_wm_remaining, + high_wm_remaining, + ) + + console.print(node_table) + + # Table-specific analysis if requested + if table: + console.print() + console.print(Panel.fit(f"[bold blue]Analysis for table: {table}[/bold blue]")) + + stats = self.analyzer.analyze_distribution(table) + + table_summary = Table(title=f"Table {table} Distribution", box=box.ROUNDED) + table_summary.add_column("Metric", style="cyan") + table_summary.add_column("Value", style="magenta") + + table_summary.add_row("Total Shards", str(stats.total_shards)) + table_summary.add_row("Total Size", format_size(stats.total_size_gb)) + table_summary.add_row("Zone Balance Score", f"{stats.zone_balance_score:.1f}/100") + table_summary.add_row("Node Balance Score", f"{stats.node_balance_score:.1f}/100") + + console.print(table_summary) + + def 
movement_candidates(self, criteria: SizeCriteria, limit: int): + """ + Find shard candidates for movement based on size criteria + + Results are sorted by nodes with least available space first, + then by shard size (smallest first) for easier moves. + """ + + console.print( + Panel.fit(f"[bold blue]Finding Moveable Shards ({criteria.min_size}-{criteria.max_size}GB)[/bold blue]") + ) + + if criteria.source_node: + console.print(f"[dim]Filtering: Only showing candidates from source node '{criteria.source_node}'[/dim]") + + # Find moveable candidates (only healthy shards suitable for operations) + candidates = self.analyzer.find_moveable_shards(criteria.min_size, criteria.max_size, criteria.table_name) + + # Filter by node if specified + if criteria.source_node: + candidates = [c for c in candidates if c.node_name == criteria.source_node] + + if not candidates: + if criteria.source_node: + console.print( + f"[yellow]No moveable shards found on node '{criteria.source_node}' " + f"in the specified size range.[/yellow]" + ) + console.print("[dim]Tip: Try different size ranges or remove --node filter to see all candidates[/dim]") + else: + console.print("[yellow]No moveable shards found in the specified size range.[/yellow]") + return + + # Show limited results + shown_candidates = candidates[:limit] + + candidates_table = Table( + title=f"Moveable Shard Candidates (showing {len(shown_candidates)} of {len(candidates)})", box=box.ROUNDED + ) + candidates_table.add_column("Table", style="cyan") + candidates_table.add_column("Shard ID", justify="right", style="magenta") + candidates_table.add_column("Type", style="blue") + candidates_table.add_column("Node", style="green") + candidates_table.add_column("Zone", style="yellow") + candidates_table.add_column("Size", justify="right", style="red") + candidates_table.add_column("Node Free Space", justify="right", style="white") + candidates_table.add_column("Documents", justify="right", style="dim") + + # Create a mapping of node 
names to available space for display + node_space_map = {node.name: node.available_space_gb for node in self.analyzer.nodes} + + for shard in shown_candidates: + node_free_space = node_space_map.get(shard.node_name, 0) + candidates_table.add_row( + f"{shard.schema_name}.{shard.table_name}", + str(shard.shard_id), + shard.shard_type, + shard.node_name, + shard.zone, + format_size(shard.size_gb), + format_size(node_free_space), + f"{shard.num_docs:,}", + ) + + console.print(candidates_table) + + if len(candidates) > limit: + console.print(f"\n[dim]... and {len(candidates) - limit} more candidates[/dim]") + + def validate_move(self, request: ShardMoveRequest): + # Parse schema and table + if "." not in request.schema_table: + console.print("[red]Error: Schema and table must be in format 'schema.table'[/red]") + return + + schema_name, table_name = request.schema_table.split(".", 1) + + console.print(Panel.fit("[bold blue]Validating Shard Move[/bold blue]")) + console.print( + f"[dim]Move: {schema_name}.{table_name}[{request.shard_id}] " + f"from {request.from_node} to {request.to_node}[/dim]" + ) + console.print() + + # Find the nodes + from_node_info = None + to_node_info = None + for node in self.analyzer.nodes: + if node.name == request.from_node: + from_node_info = node + if node.name == request.to_node: + to_node_info = node + + if not from_node_info: + console.print(f"[red]βœ— Source node '{request.from_node}' not found in cluster[/red]") + return + + if not to_node_info: + console.print(f"[red]βœ— Target node '{request.to_node}' not found in cluster[/red]") + return + + # Find the specific shard + target_shard = None + for shard in self.analyzer.shards: + if ( + shard.schema_name == schema_name + and shard.table_name == table_name + and shard.shard_id == request.shard_id + and shard.node_name == request.from_node + ): + target_shard = shard + break + + if not target_shard: + console.print(f"[red]βœ— Shard {request.shard_id} not found on node 
{request.from_node}[/red]") + console.print("[dim]Use 'xmover find-candidates' to see available shards[/dim]") + return + + # Create a move recommendation for validation + recommendation = ShardMoveRecommendation( + table_name=table_name, + schema_name=schema_name, + shard_id=request.shard_id, + from_node=request.from_node, + to_node=request.to_node, + from_zone=from_node_info.zone, + to_zone=to_node_info.zone, + shard_type=target_shard.shard_type, + size_gb=target_shard.size_gb, + reason="Manual validation", + ) + + # Display shard details + details_table = Table(title="Shard Details", box=box.ROUNDED) + details_table.add_column("Property", style="cyan") + details_table.add_column("Value", style="magenta") + + details_table.add_row("Table", f"{schema_name}.{table_name}") + details_table.add_row("Shard ID", str(request.shard_id)) + details_table.add_row("Type", target_shard.shard_type) + details_table.add_row("Size", format_size(target_shard.size_gb)) + details_table.add_row("Documents", f"{target_shard.num_docs:,}") + details_table.add_row("State", target_shard.state) + details_table.add_row("Routing State", target_shard.routing_state) + details_table.add_row("From Node", f"{request.from_node} ({from_node_info.zone})") + details_table.add_row("To Node", f"{request.to_node} ({to_node_info.zone})") + details_table.add_row("Zone Change", "Yes" if from_node_info.zone != to_node_info.zone else "No") + + console.print(details_table) + console.print() + + # Perform comprehensive validation + is_safe, safety_msg = self.analyzer.validate_move_safety( + recommendation, max_disk_usage_percent=request.max_disk_usage + ) + + if is_safe: + console.print("[green]βœ“ VALIDATION PASSED - Move appears safe[/green]") + console.print(f"[green]βœ“ {safety_msg}[/green]") + console.print() + + # Show the SQL command + console.print(Panel.fit("[bold green]Ready to Execute[/bold green]")) + console.print("[dim]# Copy and paste this command to execute the move[/dim]") + console.print() + 
console.print(f"{recommendation.to_sql()}") + console.print() + console.print("[dim]# Monitor shard health after execution[/dim]") + console.print( + "[dim]# Check with: SELECT * FROM sys.shards " + "WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]" + ) + else: + console.print("[red]βœ— VALIDATION FAILED - Move not safe[/red]") + console.print(f"[red]βœ— {safety_msg}[/red]") + console.print() + + # Provide troubleshooting guidance + if "zone conflict" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting Zone Conflicts:[/yellow]") + console.print(" β€’ Check current shard distribution: xmover zone-analysis --show-shards") + console.print(" β€’ Try moving to a different zone") + console.print(" β€’ Verify cluster has proper zone-awareness configuration") + elif "node conflict" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting Node Conflicts:[/yellow]") + console.print(" β€’ The target node already has a copy of this shard") + console.print(" β€’ Choose a different target node") + console.print(" β€’ Check shard distribution: xmover analyze") + elif "space" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting Space Issues:[/yellow]") + console.print(" β€’ Free up space on the target node") + console.print(" β€’ Choose a node with more available capacity") + console.print(" β€’ Check node capacity: xmover analyze") + elif "usage" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting High Disk Usage:[/yellow]") + console.print(" β€’ Wait for target node disk usage to decrease") + console.print(" β€’ Choose a node with lower disk usage") + console.print(" β€’ Check cluster health: xmover analyze") + console.print(" β€’ Consider using --max-disk-usage option for urgent moves") From 745b23a735ed8ad60e5032378aee157e2a8a9f10 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 20:37:58 +0200 Subject: [PATCH 09/18] Admin/XMover: Refactor -- analyze -- shard vs. 
zone --- .../admin/xmover/analyze/__init__.py | 0 .../xmover/{reporter.py => analyze/report.py} | 2 +- .../xmover/{analyzer.py => analyze/shard.py} | 10 +- cratedb_toolkit/admin/xmover/analyze/zone.py | 159 ++++++++++++++++++ cratedb_toolkit/admin/xmover/cli.py | 152 +---------------- cratedb_toolkit/admin/xmover/recommender.py | 2 +- pyproject.toml | 2 +- 7 files changed, 178 insertions(+), 149 deletions(-) create mode 100644 cratedb_toolkit/admin/xmover/analyze/__init__.py rename cratedb_toolkit/admin/xmover/{reporter.py => analyze/report.py} (99%) rename cratedb_toolkit/admin/xmover/{analyzer.py => analyze/shard.py} (99%) create mode 100644 cratedb_toolkit/admin/xmover/analyze/zone.py diff --git a/cratedb_toolkit/admin/xmover/analyze/__init__.py b/cratedb_toolkit/admin/xmover/analyze/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cratedb_toolkit/admin/xmover/reporter.py b/cratedb_toolkit/admin/xmover/analyze/report.py similarity index 99% rename from cratedb_toolkit/admin/xmover/reporter.py rename to cratedb_toolkit/admin/xmover/analyze/report.py index 6912bc54..ec0e6e3b 100644 --- a/cratedb_toolkit/admin/xmover/reporter.py +++ b/cratedb_toolkit/admin/xmover/analyze/report.py @@ -5,7 +5,7 @@ from rich.panel import Panel from rich.table import Table -from cratedb_toolkit.admin.xmover.analyzer import ShardAnalyzer +from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer from cratedb_toolkit.admin.xmover.model import ShardMoveRecommendation, ShardMoveRequest, SizeCriteria from cratedb_toolkit.admin.xmover.util import format_percentage, format_size diff --git a/cratedb_toolkit/admin/xmover/analyzer.py b/cratedb_toolkit/admin/xmover/analyze/shard.py similarity index 99% rename from cratedb_toolkit/admin/xmover/analyzer.py rename to cratedb_toolkit/admin/xmover/analyze/shard.py index 98af6a21..f4c179fd 100644 --- a/cratedb_toolkit/admin/xmover/analyzer.py +++ b/cratedb_toolkit/admin/xmover/analyze/shard.py @@ -7,8 +7,14 @@ from 
collections import defaultdict from typing import Any, Dict, List, Optional, Set, Tuple, Union -from .database import CrateDBClient -from .model import DistributionStats, NodeInfo, RecommendationConstraints, ShardInfo, ShardMoveRecommendation +from cratedb_toolkit.admin.xmover.database import CrateDBClient +from cratedb_toolkit.admin.xmover.model import ( + DistributionStats, + NodeInfo, + RecommendationConstraints, + ShardInfo, + ShardMoveRecommendation, +) logger = logging.getLogger(__name__) diff --git a/cratedb_toolkit/admin/xmover/analyze/zone.py b/cratedb_toolkit/admin/xmover/analyze/zone.py new file mode 100644 index 00000000..5b208c2f --- /dev/null +++ b/cratedb_toolkit/admin/xmover/analyze/zone.py @@ -0,0 +1,159 @@ +from typing import Dict, List, Optional + +from rich import box +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + +from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.database import CrateDBClient +from cratedb_toolkit.admin.xmover.model import ShardInfo + +console = Console() + + +class ZoneReport: + def __init__(self, client: CrateDBClient): + self.client = client + self.analyzer = ShardAnalyzer(self.client) + + def shard_balance(self, tolerance: float, table: Optional[str] = None): + """Check zone balance for shards""" + console.print(Panel.fit("[bold blue]Zone Balance Check[/bold blue]")) + console.print("[dim]Note: Analyzing all shards regardless of state for complete cluster view[/dim]") + console.print() + + zone_stats = self.analyzer.check_zone_balance(table, tolerance) + + if not zone_stats: + console.print("[yellow]No shards found for analysis[/yellow]") + return + + # Calculate totals and targets + total_shards = sum(stats["TOTAL"] for stats in zone_stats.values()) + zones = list(zone_stats.keys()) + target_per_zone = total_shards // len(zones) if zones else 0 + tolerance_range = (target_per_zone * (1 - tolerance / 100), target_per_zone 
* (1 + tolerance / 100)) + + balance_table = Table(title=f"Zone Balance Analysis (Target: {target_per_zone} Β±{tolerance}%)", box=box.ROUNDED) + balance_table.add_column("Zone", style="cyan") + balance_table.add_column("Primary", justify="right", style="blue") + balance_table.add_column("Replica", justify="right", style="green") + balance_table.add_column("Total", justify="right", style="magenta") + balance_table.add_column("Status", style="bold") + + for zone, stats in zone_stats.items(): + total = stats["TOTAL"] + + if tolerance_range[0] <= total <= tolerance_range[1]: + status = "[green]βœ“ Balanced[/green]" + elif total < tolerance_range[0]: + status = f"[yellow]⚠ Under ({total - target_per_zone:+})[/yellow]" + else: + status = f"[red]⚠ Over ({total - target_per_zone:+})[/red]" + + balance_table.add_row(zone, str(stats["PRIMARY"]), str(stats["REPLICA"]), str(total), status) + + console.print(balance_table) + + def distribution_conflicts(self, shard_details: bool = False, table: Optional[str] = None): + """Detailed analysis of zone distribution and potential conflicts""" + console.print(Panel.fit("[bold blue]Detailed Zone Analysis[/bold blue]")) + console.print("[dim]Comprehensive zone distribution analysis for CrateDB cluster[/dim]") + console.print() + + # Get all shards for analysis + shards = self.client.get_shards_info(table_name=table, for_analysis=True) + + if not shards: + console.print("[yellow]No shards found for analysis[/yellow]") + return + + # Organize by table and shard + tables: Dict[str, Dict[int, List[ShardInfo]]] = {} + for shard in shards: + table_key = f"{shard.schema_name}.{shard.table_name}" + if table_key not in tables: + tables[table_key] = {} + + shard_key = shard.shard_id + if shard_key not in tables[table_key]: + tables[table_key][shard_key] = [] + + tables[table_key][shard_key].append(shard) + + # Analyze each table + zone_conflicts = 0 + under_replicated = 0 + + for table_name, table_shards in tables.items(): + 
console.print(f"\n[bold cyan]Table: {table_name}[/bold cyan]") + + # Create analysis table + analysis_table = Table(title=f"Shard Distribution for {table_name}", box=box.ROUNDED) + analysis_table.add_column("Shard ID", justify="right", style="magenta") + analysis_table.add_column("Primary Zone", style="blue") + analysis_table.add_column("Replica Zones", style="green") + analysis_table.add_column("Total Copies", justify="right", style="cyan") + analysis_table.add_column("Status", style="bold") + + for shard_id, shard_copies in sorted(table_shards.items()): + primary_zone = "Unknown" + replica_zones = set() + total_copies = len(shard_copies) + zones_with_copies = set() + + for shard_copy in shard_copies: + zones_with_copies.add(shard_copy.zone) + if shard_copy.is_primary: + primary_zone = shard_copy.zone + else: + replica_zones.add(shard_copy.zone) + + # Determine status + status_parts = [] + if len(zones_with_copies) == 1: + zone_conflicts += 1 + status_parts.append("[red]⚠ ZONE CONFLICT[/red]") + + if total_copies < 2: # Assuming we want at least 1 replica + under_replicated += 1 + status_parts.append("[yellow]⚠ Under-replicated[/yellow]") + + if not status_parts: + status_parts.append("[green]βœ“ Good[/green]") + + replica_zones_str = ", ".join(sorted(replica_zones)) if replica_zones else "None" + + analysis_table.add_row( + str(shard_id), primary_zone, replica_zones_str, str(total_copies), " ".join(status_parts) + ) + + # Show individual shard details if requested + if shard_details: + for shard_copy in shard_copies: + health_indicator = "βœ“" if shard_copy.routing_state == "STARTED" else "⚠" + console.print( + f" {health_indicator} {shard_copy.shard_type} " + f"on {shard_copy.node_name} ({shard_copy.zone}) - {shard_copy.routing_state}" + ) + + console.print(analysis_table) + + # Summary + console.print("\n[bold]Zone Analysis Summary:[/bold]") + console.print(f" β€’ Tables analyzed: [cyan]{len(tables)}[/cyan]") + console.print(f" β€’ Zone conflicts detected: 
[red]{zone_conflicts}[/red]") + console.print(f" β€’ Under-replicated shards: [yellow]{under_replicated}[/yellow]") + + if zone_conflicts > 0: + console.print(f"\n[red]⚠ Found {zone_conflicts} zone conflicts that need attention![/red]") + console.print("[dim]Zone conflicts occur when all copies of a shard are in the same zone.[/dim]") + console.print("[dim]This violates CrateDB's zone-awareness and creates availability risks.[/dim]") + + if under_replicated > 0: + console.print(f"\n[yellow]⚠ Found {under_replicated} under-replicated shards.[/yellow]") + console.print("[dim]Consider increasing replication for better availability.[/dim]") + + if zone_conflicts == 0 and under_replicated == 0: + console.print("\n[green]βœ“ No critical zone distribution issues detected![/green]") diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 76143895..159bcd0c 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -5,24 +5,22 @@ """ import sys -from typing import Dict, List, Optional, cast +from typing import List, Optional, cast import click -from rich import box from rich.console import Console from rich.panel import Panel -from rich.table import Table +from cratedb_toolkit.admin.xmover.analyze.report import ShardReporter +from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.analyze.zone import ZoneReport from cratedb_toolkit.admin.xmover.model import ( RecommendationConstraints, - ShardInfo, ShardMoveRequest, SizeCriteria, ) from cratedb_toolkit.admin.xmover.recommender import Recommender -from cratedb_toolkit.admin.xmover.reporter import ShardReporter -from .analyzer import ShardAnalyzer from .database import CrateDBClient from .recovery import RecoveryMonitor, RecoveryOptions @@ -182,44 +180,8 @@ def test_connection(ctx, connection_string: Optional[str]): def check_balance(ctx, table: Optional[str], tolerance: float): """Check zone balance for 
shards""" client = ctx.obj["client"] - analyzer = ShardAnalyzer(client) - - console.print(Panel.fit("[bold blue]Zone Balance Check[/bold blue]")) - console.print("[dim]Note: Analyzing all shards regardless of state for complete cluster view[/dim]") - console.print() - - zone_stats = analyzer.check_zone_balance(table, tolerance) - - if not zone_stats: - console.print("[yellow]No shards found for analysis[/yellow]") - return - - # Calculate totals and targets - total_shards = sum(stats["TOTAL"] for stats in zone_stats.values()) - zones = list(zone_stats.keys()) - target_per_zone = total_shards // len(zones) if zones else 0 - tolerance_range = (target_per_zone * (1 - tolerance / 100), target_per_zone * (1 + tolerance / 100)) - - balance_table = Table(title=f"Zone Balance Analysis (Target: {target_per_zone} Β±{tolerance}%)", box=box.ROUNDED) - balance_table.add_column("Zone", style="cyan") - balance_table.add_column("Primary", justify="right", style="blue") - balance_table.add_column("Replica", justify="right", style="green") - balance_table.add_column("Total", justify="right", style="magenta") - balance_table.add_column("Status", style="bold") - - for zone, stats in zone_stats.items(): - total = stats["TOTAL"] - - if tolerance_range[0] <= total <= tolerance_range[1]: - status = "[green]βœ“ Balanced[/green]" - elif total < tolerance_range[0]: - status = f"[yellow]⚠ Under ({total - target_per_zone:+})[/yellow]" - else: - status = f"[red]⚠ Over ({total - target_per_zone:+})[/red]" - - balance_table.add_row(zone, str(stats["PRIMARY"]), str(stats["REPLICA"]), str(total), status) - - console.print(balance_table) + report = ZoneReport(client=client) + report.shard_balance(tolerance=tolerance, table=table) @main.command() @@ -229,106 +191,8 @@ def check_balance(ctx, table: Optional[str], tolerance: float): def zone_analysis(ctx, table: Optional[str], show_shards: bool): """Detailed analysis of zone distribution and potential conflicts""" client = ctx.obj["client"] - - 
console.print(Panel.fit("[bold blue]Detailed Zone Analysis[/bold blue]")) - console.print("[dim]Comprehensive zone distribution analysis for CrateDB cluster[/dim]") - console.print() - - # Get all shards for analysis - shards = client.get_shards_info(table_name=table, for_analysis=True) - - if not shards: - console.print("[yellow]No shards found for analysis[/yellow]") - return - - # Organize by table and shard - tables: Dict[str, Dict[str, List[ShardInfo]]] = {} - for shard in shards: - table_key = f"{shard.schema_name}.{shard.table_name}" - if table_key not in tables: - tables[table_key] = {} - - shard_key = shard.shard_id - if shard_key not in tables[table_key]: - tables[table_key][shard_key] = [] - - tables[table_key][shard_key].append(shard) - - # Analyze each table - zone_conflicts = 0 - under_replicated = 0 - - for table_name, table_shards in tables.items(): - console.print(f"\n[bold cyan]Table: {table_name}[/bold cyan]") - - # Create analysis table - analysis_table = Table(title=f"Shard Distribution for {table_name}", box=box.ROUNDED) - analysis_table.add_column("Shard ID", justify="right", style="magenta") - analysis_table.add_column("Primary Zone", style="blue") - analysis_table.add_column("Replica Zones", style="green") - analysis_table.add_column("Total Copies", justify="right", style="cyan") - analysis_table.add_column("Status", style="bold") - - for shard_id, shard_copies in sorted(table_shards.items()): - primary_zone = "Unknown" - replica_zones = set() - total_copies = len(shard_copies) - zones_with_copies = set() - - for shard_copy in shard_copies: - zones_with_copies.add(shard_copy.zone) - if shard_copy.is_primary: - primary_zone = shard_copy.zone - else: - replica_zones.add(shard_copy.zone) - - # Determine status - status_parts = [] - if len(zones_with_copies) == 1: - zone_conflicts += 1 - status_parts.append("[red]⚠ ZONE CONFLICT[/red]") - - if total_copies < 2: # Assuming we want at least 1 replica - under_replicated += 1 - 
status_parts.append("[yellow]⚠ Under-replicated[/yellow]") - - if not status_parts: - status_parts.append("[green]βœ“ Good[/green]") - - replica_zones_str = ", ".join(sorted(replica_zones)) if replica_zones else "None" - - analysis_table.add_row( - str(shard_id), primary_zone, replica_zones_str, str(total_copies), " ".join(status_parts) - ) - - # Show individual shard details if requested - if show_shards: - for shard_copy in shard_copies: - health_indicator = "βœ“" if shard_copy.routing_state == "STARTED" else "⚠" - console.print( - f" {health_indicator} {shard_copy.shard_type} " - f"on {shard_copy.node_name} ({shard_copy.zone}) - {shard_copy.routing_state}" - ) - - console.print(analysis_table) - - # Summary - console.print("\n[bold]Zone Analysis Summary:[/bold]") - console.print(f" β€’ Tables analyzed: [cyan]{len(tables)}[/cyan]") - console.print(f" β€’ Zone conflicts detected: [red]{zone_conflicts}[/red]") - console.print(f" β€’ Under-replicated shards: [yellow]{under_replicated}[/yellow]") - - if zone_conflicts > 0: - console.print(f"\n[red]⚠ Found {zone_conflicts} zone conflicts that need attention![/red]") - console.print("[dim]Zone conflicts occur when all copies of a shard are in the same zone.[/dim]") - console.print("[dim]This violates CrateDB's zone-awareness and creates availability risks.[/dim]") - - if under_replicated > 0: - console.print(f"\n[yellow]⚠ Found {under_replicated} under-replicated shards.[/yellow]") - console.print("[dim]Consider increasing replication for better availability.[/dim]") - - if zone_conflicts == 0 and under_replicated == 0: - console.print("\n[green]βœ“ No critical zone distribution issues detected![/green]") + report = ZoneReport(client=client) + report.distribution_conflicts(shard_details=show_shards, table=table) @main.command() diff --git a/cratedb_toolkit/admin/xmover/recommender.py b/cratedb_toolkit/admin/xmover/recommender.py index 7e780600..ceeba003 100644 --- a/cratedb_toolkit/admin/xmover/recommender.py +++ 
b/cratedb_toolkit/admin/xmover/recommender.py @@ -5,7 +5,7 @@ from rich.panel import Panel from rich.table import Table -from .analyzer import ShardAnalyzer +from .analyze.shard import ShardAnalyzer from .database import CrateDBClient from .model import RecommendationConstraints from .recovery import RecoveryMonitor, RecoveryOptions diff --git a/pyproject.toml b/pyproject.toml index bd3fa6be..f1f32203 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -324,7 +324,7 @@ lint.extend-ignore = [ "S108", ] -lint.per-file-ignores."cratedb_toolkit/admin/xmover/analyzer.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."cratedb_toolkit/admin/xmover/analyze/shard.py" = [ "T201" ] # Allow `print` lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused lint.per-file-ignores."doc/conf.py" = [ "A001", "ERA001" ] From d2cff3f337a4b2cc6075b75d4ff23f88f8d928c4 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 20:46:53 +0200 Subject: [PATCH 10/18] Admin/XMover: Refactor -- tune --- cratedb_toolkit/admin/xmover/cli.py | 4 ++-- cratedb_toolkit/admin/xmover/tune/__init__.py | 0 .../admin/xmover/{recommender.py => tune/recommend.py} | 10 +++++----- .../admin/xmover/{recovery.py => tune/recover.py} | 0 pyproject.toml | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) create mode 100644 cratedb_toolkit/admin/xmover/tune/__init__.py rename cratedb_toolkit/admin/xmover/{recommender.py => tune/recommend.py} (97%) rename cratedb_toolkit/admin/xmover/{recovery.py => tune/recover.py} (100%) diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 159bcd0c..d3a55fa8 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -19,10 +19,10 @@ ShardMoveRequest, SizeCriteria, ) -from cratedb_toolkit.admin.xmover.recommender import Recommender +from 
cratedb_toolkit.admin.xmover.tune.recommend import Recommender +from cratedb_toolkit.admin.xmover.tune.recover import RecoveryMonitor, RecoveryOptions from .database import CrateDBClient -from .recovery import RecoveryMonitor, RecoveryOptions console = Console() diff --git a/cratedb_toolkit/admin/xmover/tune/__init__.py b/cratedb_toolkit/admin/xmover/tune/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cratedb_toolkit/admin/xmover/recommender.py b/cratedb_toolkit/admin/xmover/tune/recommend.py similarity index 97% rename from cratedb_toolkit/admin/xmover/recommender.py rename to cratedb_toolkit/admin/xmover/tune/recommend.py index ceeba003..ea427bd7 100644 --- a/cratedb_toolkit/admin/xmover/recommender.py +++ b/cratedb_toolkit/admin/xmover/tune/recommend.py @@ -5,11 +5,11 @@ from rich.panel import Panel from rich.table import Table -from .analyze.shard import ShardAnalyzer -from .database import CrateDBClient -from .model import RecommendationConstraints -from .recovery import RecoveryMonitor, RecoveryOptions -from .util import format_size +from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.database import CrateDBClient +from cratedb_toolkit.admin.xmover.model import RecommendationConstraints +from cratedb_toolkit.admin.xmover.tune.recover import RecoveryMonitor, RecoveryOptions +from cratedb_toolkit.admin.xmover.util import format_size console = Console() diff --git a/cratedb_toolkit/admin/xmover/recovery.py b/cratedb_toolkit/admin/xmover/tune/recover.py similarity index 100% rename from cratedb_toolkit/admin/xmover/recovery.py rename to cratedb_toolkit/admin/xmover/tune/recover.py diff --git a/pyproject.toml b/pyproject.toml index f1f32203..e296dd5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -325,11 +325,11 @@ lint.extend-ignore = [ ] lint.per-file-ignores."cratedb_toolkit/admin/xmover/analyze/shard.py" = [ "T201" ] # Allow `print` 
-lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` -lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused +lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused lint.per-file-ignores."doc/conf.py" = [ "A001", "ERA001" ] -lint.per-file-ignores."examples/*" = [ "ERA001", "F401", "T201", "T203" ] # Allow `print` and `pprint` -lint.per-file-ignores."tests/*" = [ "S101" ] # Allow use of `assert`, and `print`. +lint.per-file-ignores."examples/*" = [ "ERA001", "F401", "T201", "T203" ] # Allow `print` and `pprint` +lint.per-file-ignores."tests/*" = [ "S101" ] # Allow use of `assert`, and `print`. lint.per-file-ignores."tests/adapter/test_rockset.py" = [ "E402" ] lint.per-file-ignores."tests/info/test_http.py" = [ "E402" ] From a91c8a170fae604fc1e09a850e4bbee6bb22a52a Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Aug 2025 20:52:56 +0200 Subject: [PATCH 11/18] Admin/XMover: Refactor -- util --- .../admin/xmover/analyze/report.py | 2 +- cratedb_toolkit/admin/xmover/analyze/shard.py | 2 +- cratedb_toolkit/admin/xmover/analyze/zone.py | 2 +- cratedb_toolkit/admin/xmover/cli.py | 132 +---------------- .../admin/xmover/tune/recommend.py | 4 +- cratedb_toolkit/admin/xmover/tune/recover.py | 4 +- cratedb_toolkit/admin/xmover/util/__init__.py | 0 .../admin/xmover/{ => util}/database.py | 0 cratedb_toolkit/admin/xmover/util/error.py | 133 ++++++++++++++++++ .../admin/xmover/{util.py => util/format.py} | 0 10 files changed, 144 insertions(+), 135 deletions(-) create mode 100644 cratedb_toolkit/admin/xmover/util/__init__.py rename cratedb_toolkit/admin/xmover/{ => util}/database.py (100%) create mode 100644 cratedb_toolkit/admin/xmover/util/error.py rename cratedb_toolkit/admin/xmover/{util.py => util/format.py} (100%) diff --git 
a/cratedb_toolkit/admin/xmover/analyze/report.py b/cratedb_toolkit/admin/xmover/analyze/report.py index ec0e6e3b..eff0f399 100644 --- a/cratedb_toolkit/admin/xmover/analyze/report.py +++ b/cratedb_toolkit/admin/xmover/analyze/report.py @@ -7,7 +7,7 @@ from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer from cratedb_toolkit.admin.xmover.model import ShardMoveRecommendation, ShardMoveRequest, SizeCriteria -from cratedb_toolkit.admin.xmover.util import format_percentage, format_size +from cratedb_toolkit.admin.xmover.util.format import format_percentage, format_size console = Console() diff --git a/cratedb_toolkit/admin/xmover/analyze/shard.py b/cratedb_toolkit/admin/xmover/analyze/shard.py index f4c179fd..334b394e 100644 --- a/cratedb_toolkit/admin/xmover/analyze/shard.py +++ b/cratedb_toolkit/admin/xmover/analyze/shard.py @@ -7,7 +7,6 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Set, Tuple, Union -from cratedb_toolkit.admin.xmover.database import CrateDBClient from cratedb_toolkit.admin.xmover.model import ( DistributionStats, NodeInfo, @@ -15,6 +14,7 @@ ShardInfo, ShardMoveRecommendation, ) +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient logger = logging.getLogger(__name__) diff --git a/cratedb_toolkit/admin/xmover/analyze/zone.py b/cratedb_toolkit/admin/xmover/analyze/zone.py index 5b208c2f..18d032eb 100644 --- a/cratedb_toolkit/admin/xmover/analyze/zone.py +++ b/cratedb_toolkit/admin/xmover/analyze/zone.py @@ -6,8 +6,8 @@ from rich.table import Table from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer -from cratedb_toolkit.admin.xmover.database import CrateDBClient from cratedb_toolkit.admin.xmover.model import ShardInfo +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient console = Console() diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index d3a55fa8..d9bb5627 100644 --- a/cratedb_toolkit/admin/xmover/cli.py 
+++ b/cratedb_toolkit/admin/xmover/cli.py @@ -5,11 +5,10 @@ """ import sys -from typing import List, Optional, cast +from typing import Optional import click from rich.console import Console -from rich.panel import Panel from cratedb_toolkit.admin.xmover.analyze.report import ShardReporter from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer @@ -21,8 +20,8 @@ ) from cratedb_toolkit.admin.xmover.tune.recommend import Recommender from cratedb_toolkit.admin.xmover.tune.recover import RecoveryMonitor, RecoveryOptions - -from .database import CrateDBClient +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient +from cratedb_toolkit.admin.xmover.util.error import explain_cratedb_error console = Console() @@ -236,130 +235,7 @@ def explain_error(ctx, error_message: Optional[str]): Example: xmover explain-error "NO(a copy of this shard is already allocated to this node)" """ - console.print(Panel.fit("[bold blue]CrateDB Error Message Decoder[/bold blue]")) - console.print("[dim]Helps decode and troubleshoot CrateDB shard allocation errors[/dim]") - console.print() - - if not error_message: - console.print("Please paste the CrateDB error message (press Enter twice when done):") - lines: List[str] = [] - while True: - try: - line = input() - if line.strip() == "" and lines: - break - lines.append(line) - except (EOFError, KeyboardInterrupt): - break - error_message = "\n".join(lines) - - if not error_message.strip(): - console.print("[yellow]No error message provided[/yellow]") - return - - console.print("[dim]Analyzing error message...[/dim]") - console.print() - - # Common CrateDB allocation error patterns and solutions - error_patterns = [ - { - "pattern": "a copy of this shard is already allocated to this node", - "title": "Node Already Has Shard Copy", - "explanation": "The target node already contains a copy (primary or replica) of this shard.", - "solutions": [ - "Choose a different target node that doesn't have this shard", - "Use 
'xmover zone-analysis --show-shards' to see current distribution", - "Verify the shard ID and table name are correct", - ], - "prevention": "Always check current shard locations before moving", - }, - { - "pattern": "there are too many copies of the shard allocated to nodes with attribute", - "title": "Zone Allocation Limit Exceeded", - "explanation": "CrateDB's zone awareness prevents too many copies in the same zone.", - "solutions": [ - "Move the shard to a different availability zone", - "Check zone balance with 'xmover check-balance'", - "Ensure target zone doesn't already have copies of this shard", - ], - "prevention": "Use 'xmover recommend' which respects zone constraints", - }, - { - "pattern": "not enough disk space", - "title": "Insufficient Disk Space", - "explanation": "The target node doesn't have enough free disk space for the shard.", - "solutions": [ - "Free up space on the target node", - "Choose a node with more available capacity", - "Check available space with 'xmover analyze'", - ], - "prevention": "Use '--min-free-space' parameter in recommendations", - }, - { - "pattern": "shard recovery limit", - "title": "Recovery Limit Exceeded", - "explanation": "Too many shards are currently being moved/recovered simultaneously.", - "solutions": [ - "Wait for current recoveries to complete", - "Check recovery status in CrateDB admin UI", - "Reduce concurrent recoveries in cluster settings", - ], - "prevention": "Move shards gradually, monitor recovery progress", - }, - { - "pattern": "allocation is disabled", - "title": "Allocation Disabled", - "explanation": "Shard allocation is temporarily disabled in the cluster.", - "solutions": [ - "Re-enable allocation: PUT /_cluster/settings " - '{"persistent":{"cluster.routing.allocation.enable":"all"}}', - "Check if allocation was disabled for maintenance", - "Verify cluster health before re-enabling", - ], - "prevention": "Check allocation status before performing moves", - }, - ] - - # Find matching patterns 
- matches = [] - error_lower = error_message.lower() - - for pattern_info in error_patterns: - if cast(str, pattern_info["pattern"]).lower() in error_lower: - matches.append(pattern_info) - - if matches: - for i, match in enumerate(matches): - if i > 0: - console.print("\n" + "─" * 60 + "\n") - - console.print(f"[bold red]🚨 {match['title']}[/bold red]") - console.print(f"[yellow]πŸ“ Explanation:[/yellow] {match['explanation']}") - console.print() - - console.print("[green]πŸ’‘ Solutions:[/green]") - for j, solution in enumerate(match["solutions"], 1): - console.print(f" {j}. {solution}") - console.print() - - console.print(f"[blue]πŸ›‘οΈ Prevention:[/blue] {match['prevention']}") - else: - console.print("[yellow]⚠ No specific pattern match found[/yellow]") - console.print() - console.print("[bold]General Troubleshooting Steps:[/bold]") - console.print("1. Check current shard distribution: [cyan]xmover analyze[/cyan]") - console.print( - "2. Validate the specific move: [cyan]xmover validate-move schema.table shard_id from_node to_node[/cyan]" - ) - console.print("3. Check zone conflicts: [cyan]xmover zone-analysis --show-shards[/cyan]") - console.print("4. Verify node capacity: [cyan]xmover analyze[/cyan]") - console.print("5. 
Review CrateDB documentation on shard allocation") - - console.print() - console.print("[dim]πŸ’‘ Tip: Use 'xmover validate-move' to check moves before execution[/dim]") - console.print( - "[dim]πŸ“š For more help: https://crate.io/docs/crate/reference/en/latest/admin/system-information.html[/dim]" - ) + explain_cratedb_error(error_message) @main.command() diff --git a/cratedb_toolkit/admin/xmover/tune/recommend.py b/cratedb_toolkit/admin/xmover/tune/recommend.py index ea427bd7..704912b1 100644 --- a/cratedb_toolkit/admin/xmover/tune/recommend.py +++ b/cratedb_toolkit/admin/xmover/tune/recommend.py @@ -6,10 +6,10 @@ from rich.table import Table from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer -from cratedb_toolkit.admin.xmover.database import CrateDBClient from cratedb_toolkit.admin.xmover.model import RecommendationConstraints from cratedb_toolkit.admin.xmover.tune.recover import RecoveryMonitor, RecoveryOptions -from cratedb_toolkit.admin.xmover.util import format_size +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient +from cratedb_toolkit.admin.xmover.util.format import format_size console = Console() diff --git a/cratedb_toolkit/admin/xmover/tune/recover.py b/cratedb_toolkit/admin/xmover/tune/recover.py index 958aed15..d88a295f 100644 --- a/cratedb_toolkit/admin/xmover/tune/recover.py +++ b/cratedb_toolkit/admin/xmover/tune/recover.py @@ -5,9 +5,9 @@ from rich.console import Console -from cratedb_toolkit.admin.xmover.database import CrateDBClient from cratedb_toolkit.admin.xmover.model import RecoveryInfo -from cratedb_toolkit.admin.xmover.util import format_translog_info +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient +from cratedb_toolkit.admin.xmover.util.format import format_translog_info console = Console() diff --git a/cratedb_toolkit/admin/xmover/util/__init__.py b/cratedb_toolkit/admin/xmover/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/cratedb_toolkit/admin/xmover/database.py b/cratedb_toolkit/admin/xmover/util/database.py similarity index 100% rename from cratedb_toolkit/admin/xmover/database.py rename to cratedb_toolkit/admin/xmover/util/database.py diff --git a/cratedb_toolkit/admin/xmover/util/error.py b/cratedb_toolkit/admin/xmover/util/error.py new file mode 100644 index 00000000..11dd5f39 --- /dev/null +++ b/cratedb_toolkit/admin/xmover/util/error.py @@ -0,0 +1,133 @@ +from typing import List, Optional, cast + +from rich.console import Console +from rich.panel import Panel + +console = Console() + + +def explain_cratedb_error(error_message: Optional[str]): + console.print(Panel.fit("[bold blue]CrateDB Error Message Decoder[/bold blue]")) + console.print("[dim]Helps decode and troubleshoot CrateDB shard allocation errors[/dim]") + console.print() + + if not error_message: + console.print("Please paste the CrateDB error message (press Enter twice when done):") + lines: List[str] = [] + while True: + try: + line = input() + if line.strip() == "" and lines: + break + lines.append(line) + except (EOFError, KeyboardInterrupt): + break + error_message = "\n".join(lines) + + if not error_message.strip(): + console.print("[yellow]No error message provided[/yellow]") + return + + console.print("[dim]Analyzing error message...[/dim]") + console.print() + + # Common CrateDB allocation error patterns and solutions + error_patterns = [ + { + "pattern": "a copy of this shard is already allocated to this node", + "title": "Node Already Has Shard Copy", + "explanation": "The target node already contains a copy (primary or replica) of this shard.", + "solutions": [ + "Choose a different target node that doesn't have this shard", + "Use 'xmover zone-analysis --show-shards' to see current distribution", + "Verify the shard ID and table name are correct", + ], + "prevention": "Always check current shard locations before moving", + }, + { + "pattern": "there are too many copies of the shard allocated to nodes 
with attribute", + "title": "Zone Allocation Limit Exceeded", + "explanation": "CrateDB's zone awareness prevents too many copies in the same zone.", + "solutions": [ + "Move the shard to a different availability zone", + "Check zone balance with 'xmover check-balance'", + "Ensure target zone doesn't already have copies of this shard", + ], + "prevention": "Use 'xmover recommend' which respects zone constraints", + }, + { + "pattern": "not enough disk space", + "title": "Insufficient Disk Space", + "explanation": "The target node doesn't have enough free disk space for the shard.", + "solutions": [ + "Free up space on the target node", + "Choose a node with more available capacity", + "Check available space with 'xmover analyze'", + ], + "prevention": "Use '--min-free-space' parameter in recommendations", + }, + { + "pattern": "shard recovery limit", + "title": "Recovery Limit Exceeded", + "explanation": "Too many shards are currently being moved/recovered simultaneously.", + "solutions": [ + "Wait for current recoveries to complete", + "Check recovery status in CrateDB admin UI", + "Reduce concurrent recoveries in cluster settings", + ], + "prevention": "Move shards gradually, monitor recovery progress", + }, + { + "pattern": "allocation is disabled", + "title": "Allocation Disabled", + "explanation": "Shard allocation is temporarily disabled in the cluster.", + "solutions": [ + "Re-enable allocation: PUT /_cluster/settings " + '{"persistent":{"cluster.routing.allocation.enable":"all"}}', + "Check if allocation was disabled for maintenance", + "Verify cluster health before re-enabling", + ], + "prevention": "Check allocation status before performing moves", + }, + ] + + # Find matching patterns + matches = [] + error_lower = error_message.lower() + + for pattern_info in error_patterns: + if cast(str, pattern_info["pattern"]).lower() in error_lower: + matches.append(pattern_info) + + if matches: + for i, match in enumerate(matches): + if i > 0: + console.print("\n" 
+ "─" * 60 + "\n") + + console.print(f"[bold red]🚨 {match['title']}[/bold red]") + console.print(f"[yellow]πŸ“ Explanation:[/yellow] {match['explanation']}") + console.print() + + console.print("[green]πŸ’‘ Solutions:[/green]") + for j, solution in enumerate(match["solutions"], 1): + console.print(f" {j}. {solution}") + console.print() + + console.print(f"[blue]πŸ›‘οΈ Prevention:[/blue] {match['prevention']}") + else: + console.print("[yellow]⚠ No specific pattern match found[/yellow]") + console.print() + console.print("[bold]General Troubleshooting Steps:[/bold]") + console.print("1. Check current shard distribution: [cyan]xmover analyze[/cyan]") + console.print( + "2. Validate the specific move: [cyan]xmover validate-move schema.table shard_id from_node to_node[/cyan]" + ) + console.print("3. Check zone conflicts: [cyan]xmover zone-analysis --show-shards[/cyan]") + console.print("4. Verify node capacity: [cyan]xmover analyze[/cyan]") + console.print("5. Review CrateDB documentation on shard allocation") + + console.print() + console.print("[dim]πŸ’‘ Tip: Use 'xmover validate-move' to check moves before execution[/dim]") + console.print( + "[dim]πŸ“š For more help: https://crate.io/docs/crate/reference/en/latest/admin/system-information.html[/dim]" + ) diff --git a/cratedb_toolkit/admin/xmover/util.py b/cratedb_toolkit/admin/xmover/util/format.py similarity index 100% rename from cratedb_toolkit/admin/xmover/util.py rename to cratedb_toolkit/admin/xmover/util/format.py From 2fb889d26ba9ebbbfd2bfc79c2a6752f31a75861 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 21 Aug 2025 02:24:58 +0200 Subject: [PATCH 12/18] Admin/XMover: Naming things -- `analyze` vs. 
`operational` --- .../xmover/{analyze => analysis}/__init__.py | 0 .../xmover/{analyze => analysis}/report.py | 73 +--------------- .../xmover/{analyze => analysis}/shard.py | 0 .../xmover/{analyze => analysis}/zone.py | 2 +- cratedb_toolkit/admin/xmover/cli.py | 17 ++-- .../xmover/{tune => operational}/__init__.py | 0 .../admin/xmover/operational/candidates.py | 84 +++++++++++++++++++ .../xmover/{tune => operational}/recommend.py | 4 +- .../xmover/{tune => operational}/recover.py | 0 pyproject.toml | 10 +-- 10 files changed, 103 insertions(+), 87 deletions(-) rename cratedb_toolkit/admin/xmover/{analyze => analysis}/__init__.py (100%) rename cratedb_toolkit/admin/xmover/{analyze => analysis}/report.py (78%) rename cratedb_toolkit/admin/xmover/{analyze => analysis}/shard.py (100%) rename cratedb_toolkit/admin/xmover/{analyze => analysis}/zone.py (99%) rename cratedb_toolkit/admin/xmover/{tune => operational}/__init__.py (100%) create mode 100644 cratedb_toolkit/admin/xmover/operational/candidates.py rename cratedb_toolkit/admin/xmover/{tune => operational}/recommend.py (99%) rename cratedb_toolkit/admin/xmover/{tune => operational}/recover.py (100%) diff --git a/cratedb_toolkit/admin/xmover/analyze/__init__.py b/cratedb_toolkit/admin/xmover/analysis/__init__.py similarity index 100% rename from cratedb_toolkit/admin/xmover/analyze/__init__.py rename to cratedb_toolkit/admin/xmover/analysis/__init__.py diff --git a/cratedb_toolkit/admin/xmover/analyze/report.py b/cratedb_toolkit/admin/xmover/analysis/report.py similarity index 78% rename from cratedb_toolkit/admin/xmover/analyze/report.py rename to cratedb_toolkit/admin/xmover/analysis/report.py index eff0f399..f6faf879 100644 --- a/cratedb_toolkit/admin/xmover/analyze/report.py +++ b/cratedb_toolkit/admin/xmover/analysis/report.py @@ -5,8 +5,8 @@ from rich.panel import Panel from rich.table import Table -from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer -from cratedb_toolkit.admin.xmover.model 
import ShardMoveRecommendation, ShardMoveRequest, SizeCriteria +from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.model import ShardMoveRecommendation, ShardMoveRequest from cratedb_toolkit.admin.xmover.util.format import format_percentage, format_size console = Console() @@ -124,75 +124,6 @@ def distribution(self, table: str = None): console.print(table_summary) - def movement_candidates(self, criteria: SizeCriteria, limit: int): - """ - Find shard candidates for movement based on size criteria - - Results are sorted by nodes with least available space first, - then by shard size (smallest first) for easier moves. - """ - - console.print( - Panel.fit(f"[bold blue]Finding Moveable Shards ({criteria.min_size}-{criteria.max_size}GB)[/bold blue]") - ) - - if criteria.source_node: - console.print(f"[dim]Filtering: Only showing candidates from source node '{criteria.source_node}'[/dim]") - - # Find moveable candidates (only healthy shards suitable for operations) - candidates = self.analyzer.find_moveable_shards(criteria.min_size, criteria.max_size, criteria.table_name) - - # Filter by node if specified - if criteria.source_node: - candidates = [c for c in candidates if c.node_name == criteria.source_node] - - if not candidates: - if criteria.source_node: - console.print( - f"[yellow]No moveable shards found on node '{criteria.source_node}' " - f"in the specified size range.[/yellow]" - ) - console.print("[dim]Tip: Try different size ranges or remove --node filter to see all candidates[/dim]") - else: - console.print("[yellow]No moveable shards found in the specified size range.[/yellow]") - return - - # Show limited results - shown_candidates = candidates[:limit] - - candidates_table = Table( - title=f"Moveable Shard Candidates (showing {len(shown_candidates)} of {len(candidates)})", box=box.ROUNDED - ) - candidates_table.add_column("Table", style="cyan") - candidates_table.add_column("Shard ID", justify="right", 
style="magenta") - candidates_table.add_column("Type", style="blue") - candidates_table.add_column("Node", style="green") - candidates_table.add_column("Zone", style="yellow") - candidates_table.add_column("Size", justify="right", style="red") - candidates_table.add_column("Node Free Space", justify="right", style="white") - candidates_table.add_column("Documents", justify="right", style="dim") - - # Create a mapping of node names to available space for display - node_space_map = {node.name: node.available_space_gb for node in self.analyzer.nodes} - - for shard in shown_candidates: - node_free_space = node_space_map.get(shard.node_name, 0) - candidates_table.add_row( - f"{shard.schema_name}.{shard.table_name}", - str(shard.shard_id), - shard.shard_type, - shard.node_name, - shard.zone, - format_size(shard.size_gb), - format_size(node_free_space), - f"{shard.num_docs:,}", - ) - - console.print(candidates_table) - - if len(candidates) > limit: - console.print(f"\n[dim]... and {len(candidates) - limit} more candidates[/dim]") - def validate_move(self, request: ShardMoveRequest): # Parse schema and table if "." 
not in request.schema_table: diff --git a/cratedb_toolkit/admin/xmover/analyze/shard.py b/cratedb_toolkit/admin/xmover/analysis/shard.py similarity index 100% rename from cratedb_toolkit/admin/xmover/analyze/shard.py rename to cratedb_toolkit/admin/xmover/analysis/shard.py diff --git a/cratedb_toolkit/admin/xmover/analyze/zone.py b/cratedb_toolkit/admin/xmover/analysis/zone.py similarity index 99% rename from cratedb_toolkit/admin/xmover/analyze/zone.py rename to cratedb_toolkit/admin/xmover/analysis/zone.py index 18d032eb..718d88f0 100644 --- a/cratedb_toolkit/admin/xmover/analyze/zone.py +++ b/cratedb_toolkit/admin/xmover/analysis/zone.py @@ -5,7 +5,7 @@ from rich.panel import Panel from rich.table import Table -from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer from cratedb_toolkit.admin.xmover.model import ShardInfo from cratedb_toolkit.admin.xmover.util.database import CrateDBClient diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index d9bb5627..53cce1f9 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -10,16 +10,17 @@ import click from rich.console import Console -from cratedb_toolkit.admin.xmover.analyze.report import ShardReporter -from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer -from cratedb_toolkit.admin.xmover.analyze.zone import ZoneReport +from cratedb_toolkit.admin.xmover.analysis.report import ShardReporter +from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.analysis.zone import ZoneReport from cratedb_toolkit.admin.xmover.model import ( RecommendationConstraints, ShardMoveRequest, SizeCriteria, ) -from cratedb_toolkit.admin.xmover.tune.recommend import Recommender -from cratedb_toolkit.admin.xmover.tune.recover import RecoveryMonitor, RecoveryOptions +from 
cratedb_toolkit.admin.xmover.operational.candidates import CandidateFinder +from cratedb_toolkit.admin.xmover.operational.recommend import Recommender +from cratedb_toolkit.admin.xmover.operational.recover import RecoveryMonitor, RecoveryOptions from cratedb_toolkit.admin.xmover.util.database import CrateDBClient from cratedb_toolkit.admin.xmover.util.error import explain_cratedb_error @@ -72,8 +73,8 @@ def find_candidates(ctx, min_size: float, max_size: float, limit: int, table: Op """Find shard candidates for movement based on size criteria""" client = ctx.obj["client"] analyzer = ShardAnalyzer(client) - reporter = ShardReporter(analyzer) - reporter.movement_candidates( + finder = CandidateFinder(analyzer) + finder.movement_candidates( criteria=SizeCriteria( min_size=min_size, max_size=max_size, @@ -209,7 +210,7 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node FROM_NODE: Source node name TO_NODE: Target node name - Example: xmover validate-move CUROV.maddoxxFormfactor 4 data-hot-1 data-hot-3 + Example: xmover validate-move CUROV.maddoxxS 4 data-hot-1 data-hot-3 """ client = ctx.obj["client"] analyzer = ShardAnalyzer(client) diff --git a/cratedb_toolkit/admin/xmover/tune/__init__.py b/cratedb_toolkit/admin/xmover/operational/__init__.py similarity index 100% rename from cratedb_toolkit/admin/xmover/tune/__init__.py rename to cratedb_toolkit/admin/xmover/operational/__init__.py diff --git a/cratedb_toolkit/admin/xmover/operational/candidates.py b/cratedb_toolkit/admin/xmover/operational/candidates.py new file mode 100644 index 00000000..dd7d4930 --- /dev/null +++ b/cratedb_toolkit/admin/xmover/operational/candidates.py @@ -0,0 +1,84 @@ +from rich import box +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + +from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.model import SizeCriteria +from cratedb_toolkit.admin.xmover.util.format 
import format_size + +console = Console() + + +class CandidateFinder: + def __init__(self, analyzer: ShardAnalyzer): + self.analyzer = analyzer + + def movement_candidates(self, criteria: SizeCriteria, limit: int): + """ + Find shard candidates for movement based on size criteria + + Results are sorted by nodes with least available space first, + then by shard size (smallest first) for easier moves. + """ + + console.print( + Panel.fit(f"[bold blue]Finding Moveable Shards ({criteria.min_size}-{criteria.max_size}GB)[/bold blue]") + ) + + if criteria.source_node: + console.print(f"[dim]Filtering: Only showing candidates from source node '{criteria.source_node}'[/dim]") + + # Find moveable candidates (only healthy shards suitable for operations) + candidates = self.analyzer.find_moveable_shards(criteria.min_size, criteria.max_size, criteria.table_name) + + # Filter by node if specified + if criteria.source_node: + candidates = [c for c in candidates if c.node_name == criteria.source_node] + + if not candidates: + if criteria.source_node: + console.print( + f"[yellow]No moveable shards found on node '{criteria.source_node}' " + f"in the specified size range.[/yellow]" + ) + console.print("[dim]Tip: Try different size ranges or remove --node filter to see all candidates[/dim]") + else: + console.print("[yellow]No moveable shards found in the specified size range.[/yellow]") + return + + # Show limited results + shown_candidates = candidates[:limit] + + candidates_table = Table( + title=f"Moveable Shard Candidates (showing {len(shown_candidates)} of {len(candidates)})", box=box.ROUNDED + ) + candidates_table.add_column("Table", style="cyan") + candidates_table.add_column("Shard ID", justify="right", style="magenta") + candidates_table.add_column("Type", style="blue") + candidates_table.add_column("Node", style="green") + candidates_table.add_column("Zone", style="yellow") + candidates_table.add_column("Size", justify="right", style="red") + 
candidates_table.add_column("Node Free Space", justify="right", style="white") + candidates_table.add_column("Documents", justify="right", style="dim") + + # Create a mapping of node names to available space for display + node_space_map = {node.name: node.available_space_gb for node in self.analyzer.nodes} + + for shard in shown_candidates: + node_free_space = node_space_map.get(shard.node_name, 0) + candidates_table.add_row( + f"{shard.schema_name}.{shard.table_name}", + str(shard.shard_id), + shard.shard_type, + shard.node_name, + shard.zone, + format_size(shard.size_gb), + format_size(node_free_space), + f"{shard.num_docs:,}", + ) + + console.print(candidates_table) + + if len(candidates) > limit: + console.print(f"\n[dim]... and {len(candidates) - limit} more candidates[/dim]") diff --git a/cratedb_toolkit/admin/xmover/tune/recommend.py b/cratedb_toolkit/admin/xmover/operational/recommend.py similarity index 99% rename from cratedb_toolkit/admin/xmover/tune/recommend.py rename to cratedb_toolkit/admin/xmover/operational/recommend.py index 704912b1..d7ff0a07 100644 --- a/cratedb_toolkit/admin/xmover/tune/recommend.py +++ b/cratedb_toolkit/admin/xmover/operational/recommend.py @@ -5,9 +5,9 @@ from rich.panel import Panel from rich.table import Table -from cratedb_toolkit.admin.xmover.analyze.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer from cratedb_toolkit.admin.xmover.model import RecommendationConstraints -from cratedb_toolkit.admin.xmover.tune.recover import RecoveryMonitor, RecoveryOptions +from cratedb_toolkit.admin.xmover.operational.recover import RecoveryMonitor, RecoveryOptions from cratedb_toolkit.admin.xmover.util.database import CrateDBClient from cratedb_toolkit.admin.xmover.util.format import format_size diff --git a/cratedb_toolkit/admin/xmover/tune/recover.py b/cratedb_toolkit/admin/xmover/operational/recover.py similarity index 100% rename from cratedb_toolkit/admin/xmover/tune/recover.py 
rename to cratedb_toolkit/admin/xmover/operational/recover.py diff --git a/pyproject.toml b/pyproject.toml index e296dd5d..fa3309d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -324,12 +324,12 @@ lint.extend-ignore = [ "S108", ] -lint.per-file-ignores."cratedb_toolkit/admin/xmover/analyze/shard.py" = [ "T201" ] # Allow `print` -lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` -lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused +lint.per-file-ignores."cratedb_toolkit/admin/xmover/analysis/shard.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused lint.per-file-ignores."doc/conf.py" = [ "A001", "ERA001" ] -lint.per-file-ignores."examples/*" = [ "ERA001", "F401", "T201", "T203" ] # Allow `print` and `pprint` -lint.per-file-ignores."tests/*" = [ "S101" ] # Allow use of `assert`, and `print`. +lint.per-file-ignores."examples/*" = [ "ERA001", "F401", "T201", "T203" ] # Allow `print` and `pprint` +lint.per-file-ignores."tests/*" = [ "S101" ] # Allow use of `assert`, and `print`. lint.per-file-ignores."tests/adapter/test_rockset.py" = [ "E402" ] lint.per-file-ignores."tests/info/test_http.py" = [ "E402" ] From d009652bdd31a6cc8b7865ce44eeaa3ce228e237 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 21 Aug 2025 02:42:24 +0200 Subject: [PATCH 13/18] Admin/XMover: Naming things. OO. 
--- .../admin/xmover/analysis/report.py | 256 ------------------ .../admin/xmover/analysis/shard.py | 145 +++++++++- cratedb_toolkit/admin/xmover/cli.py | 27 +- cratedb_toolkit/admin/xmover/model.py | 10 +- .../admin/xmover/operational/recommend.py | 177 ++++++++++-- 5 files changed, 305 insertions(+), 310 deletions(-) delete mode 100644 cratedb_toolkit/admin/xmover/analysis/report.py diff --git a/cratedb_toolkit/admin/xmover/analysis/report.py b/cratedb_toolkit/admin/xmover/analysis/report.py deleted file mode 100644 index f6faf879..00000000 --- a/cratedb_toolkit/admin/xmover/analysis/report.py +++ /dev/null @@ -1,256 +0,0 @@ -from typing import Any, Dict - -from rich import box -from rich.console import Console -from rich.panel import Panel -from rich.table import Table - -from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer -from cratedb_toolkit.admin.xmover.model import ShardMoveRecommendation, ShardMoveRequest -from cratedb_toolkit.admin.xmover.util.format import format_percentage, format_size - -console = Console() - - -class ShardReporter: - def __init__(self, analyzer: ShardAnalyzer): - self.analyzer = analyzer - - def distribution(self, table: str = None): - """Analyze current shard distribution across nodes and zones""" - console.print(Panel.fit("[bold blue]CrateDB Cluster Analysis[/bold blue]")) - - # Get cluster overview (includes all shards for complete analysis) - overview: Dict[str, Any] = self.analyzer.get_cluster_overview() - - # Cluster summary table - summary_table = Table(title="Cluster Summary", box=box.ROUNDED) - summary_table.add_column("Metric", style="cyan") - summary_table.add_column("Value", style="magenta") - - summary_table.add_row("Nodes", str(overview["nodes"])) - summary_table.add_row("Availability Zones", str(overview["zones"])) - summary_table.add_row("Total Shards", str(overview["total_shards"])) - summary_table.add_row("Primary Shards", str(overview["primary_shards"])) - summary_table.add_row("Replica Shards", 
str(overview["replica_shards"])) - summary_table.add_row("Total Size", format_size(overview["total_size_gb"])) - - console.print(summary_table) - console.print() - - # Disk watermarks table - if overview.get("watermarks"): - watermarks_table = Table(title="Disk Allocation Watermarks", box=box.ROUNDED) - watermarks_table.add_column("Setting", style="cyan") - watermarks_table.add_column("Value", style="magenta") - - watermarks = overview["watermarks"] - watermarks_table.add_row("Low Watermark", str(watermarks.get("low", "Not set"))) - watermarks_table.add_row("High Watermark", str(watermarks.get("high", "Not set"))) - watermarks_table.add_row("Flood Stage", str(watermarks.get("flood_stage", "Not set"))) - watermarks_table.add_row( - "Enable for Single Node", str(watermarks.get("enable_for_single_data_node", "Not set")) - ) - - console.print(watermarks_table) - console.print() - - # Zone distribution table - zone_table = Table(title="Zone Distribution", box=box.ROUNDED) - zone_table.add_column("Zone", style="cyan") - zone_table.add_column("Shards", justify="right", style="magenta") - zone_table.add_column("Percentage", justify="right", style="green") - - total_shards = overview["total_shards"] - for zone, count in overview["zone_distribution"].items(): - percentage = (count / total_shards * 100) if total_shards > 0 else 0 - zone_table.add_row(zone, str(count), f"{percentage:.1f}%") - - console.print(zone_table) - console.print() - - # Node health table - node_table = Table(title="Node Health", box=box.ROUNDED) - node_table.add_column("Node", style="cyan") - node_table.add_column("Zone", style="blue") - node_table.add_column("Shards", justify="right", style="magenta") - node_table.add_column("Size", justify="right", style="green") - node_table.add_column("Disk Usage", justify="right") - node_table.add_column("Available Space", justify="right", style="green") - node_table.add_column("Until Low WM", justify="right", style="yellow") - node_table.add_column("Until High 
WM", justify="right", style="red") - - for node_info in overview["node_health"]: - # Format watermark remaining capacity - low_wm_remaining = ( - format_size(node_info["remaining_to_low_watermark_gb"]) - if node_info["remaining_to_low_watermark_gb"] > 0 - else "[red]Exceeded[/red]" - ) - high_wm_remaining = ( - format_size(node_info["remaining_to_high_watermark_gb"]) - if node_info["remaining_to_high_watermark_gb"] > 0 - else "[red]Exceeded[/red]" - ) - - node_table.add_row( - node_info["name"], - node_info["zone"], - str(node_info["shards"]), - format_size(node_info["size_gb"]), - format_percentage(node_info["disk_usage_percent"]), - format_size(node_info["available_space_gb"]), - low_wm_remaining, - high_wm_remaining, - ) - - console.print(node_table) - - # Table-specific analysis if requested - if table: - console.print() - console.print(Panel.fit(f"[bold blue]Analysis for table: {table}[/bold blue]")) - - stats = self.analyzer.analyze_distribution(table) - - table_summary = Table(title=f"Table {table} Distribution", box=box.ROUNDED) - table_summary.add_column("Metric", style="cyan") - table_summary.add_column("Value", style="magenta") - - table_summary.add_row("Total Shards", str(stats.total_shards)) - table_summary.add_row("Total Size", format_size(stats.total_size_gb)) - table_summary.add_row("Zone Balance Score", f"{stats.zone_balance_score:.1f}/100") - table_summary.add_row("Node Balance Score", f"{stats.node_balance_score:.1f}/100") - - console.print(table_summary) - - def validate_move(self, request: ShardMoveRequest): - # Parse schema and table - if "." 
not in request.schema_table: - console.print("[red]Error: Schema and table must be in format 'schema.table'[/red]") - return - - schema_name, table_name = request.schema_table.split(".", 1) - - console.print(Panel.fit("[bold blue]Validating Shard Move[/bold blue]")) - console.print( - f"[dim]Move: {schema_name}.{table_name}[{request.shard_id}] " - f"from {request.from_node} to {request.to_node}[/dim]" - ) - console.print() - - # Find the nodes - from_node_info = None - to_node_info = None - for node in self.analyzer.nodes: - if node.name == request.from_node: - from_node_info = node - if node.name == request.to_node: - to_node_info = node - - if not from_node_info: - console.print(f"[red]βœ— Source node '{request.from_node}' not found in cluster[/red]") - return - - if not to_node_info: - console.print(f"[red]βœ— Target node '{request.to_node}' not found in cluster[/red]") - return - - # Find the specific shard - target_shard = None - for shard in self.analyzer.shards: - if ( - shard.schema_name == schema_name - and shard.table_name == table_name - and shard.shard_id == request.shard_id - and shard.node_name == request.from_node - ): - target_shard = shard - break - - if not target_shard: - console.print(f"[red]βœ— Shard {request.shard_id} not found on node {request.from_node}[/red]") - console.print("[dim]Use 'xmover find-candidates' to see available shards[/dim]") - return - - # Create a move recommendation for validation - recommendation = ShardMoveRecommendation( - table_name=table_name, - schema_name=schema_name, - shard_id=request.shard_id, - from_node=request.from_node, - to_node=request.to_node, - from_zone=from_node_info.zone, - to_zone=to_node_info.zone, - shard_type=target_shard.shard_type, - size_gb=target_shard.size_gb, - reason="Manual validation", - ) - - # Display shard details - details_table = Table(title="Shard Details", box=box.ROUNDED) - details_table.add_column("Property", style="cyan") - details_table.add_column("Value", style="magenta") - - 
details_table.add_row("Table", f"{schema_name}.{table_name}") - details_table.add_row("Shard ID", str(request.shard_id)) - details_table.add_row("Type", target_shard.shard_type) - details_table.add_row("Size", format_size(target_shard.size_gb)) - details_table.add_row("Documents", f"{target_shard.num_docs:,}") - details_table.add_row("State", target_shard.state) - details_table.add_row("Routing State", target_shard.routing_state) - details_table.add_row("From Node", f"{request.from_node} ({from_node_info.zone})") - details_table.add_row("To Node", f"{request.to_node} ({to_node_info.zone})") - details_table.add_row("Zone Change", "Yes" if from_node_info.zone != to_node_info.zone else "No") - - console.print(details_table) - console.print() - - # Perform comprehensive validation - is_safe, safety_msg = self.analyzer.validate_move_safety( - recommendation, max_disk_usage_percent=request.max_disk_usage - ) - - if is_safe: - console.print("[green]βœ“ VALIDATION PASSED - Move appears safe[/green]") - console.print(f"[green]βœ“ {safety_msg}[/green]") - console.print() - - # Show the SQL command - console.print(Panel.fit("[bold green]Ready to Execute[/bold green]")) - console.print("[dim]# Copy and paste this command to execute the move[/dim]") - console.print() - console.print(f"{recommendation.to_sql()}") - console.print() - console.print("[dim]# Monitor shard health after execution[/dim]") - console.print( - "[dim]# Check with: SELECT * FROM sys.shards " - "WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]" - ) - else: - console.print("[red]βœ— VALIDATION FAILED - Move not safe[/red]") - console.print(f"[red]βœ— {safety_msg}[/red]") - console.print() - - # Provide troubleshooting guidance - if "zone conflict" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting Zone Conflicts:[/yellow]") - console.print(" β€’ Check current shard distribution: xmover zone-analysis --show-shards") - console.print(" β€’ Try moving to a different zone") - 
console.print(" β€’ Verify cluster has proper zone-awareness configuration") - elif "node conflict" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting Node Conflicts:[/yellow]") - console.print(" β€’ The target node already has a copy of this shard") - console.print(" β€’ Choose a different target node") - console.print(" β€’ Check shard distribution: xmover analyze") - elif "space" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting Space Issues:[/yellow]") - console.print(" β€’ Free up space on the target node") - console.print(" β€’ Choose a node with more available capacity") - console.print(" β€’ Check node capacity: xmover analyze") - elif "usage" in safety_msg.lower(): - console.print("[yellow]πŸ’‘ Troubleshooting High Disk Usage:[/yellow]") - console.print(" β€’ Wait for target node disk usage to decrease") - console.print(" β€’ Choose a node with lower disk usage") - console.print(" β€’ Check cluster health: xmover analyze") - console.print(" β€’ Consider using --max-disk-usage option for urgent moves") diff --git a/cratedb_toolkit/admin/xmover/analysis/shard.py b/cratedb_toolkit/admin/xmover/analysis/shard.py index 334b394e..f6f24b6b 100644 --- a/cratedb_toolkit/admin/xmover/analysis/shard.py +++ b/cratedb_toolkit/admin/xmover/analysis/shard.py @@ -7,17 +7,25 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Set, Tuple, Union +from rich import box +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + from cratedb_toolkit.admin.xmover.model import ( DistributionStats, NodeInfo, - RecommendationConstraints, ShardInfo, - ShardMoveRecommendation, + ShardRelocationConstraints, + ShardRelocationResponse, ) from cratedb_toolkit.admin.xmover.util.database import CrateDBClient +from cratedb_toolkit.admin.xmover.util.format import format_percentage, format_size logger = logging.getLogger(__name__) +console = Console() + class ShardAnalyzer: """Analyzer 
for CrateDB shard distribution and rebalancing""" @@ -181,8 +189,8 @@ def find_nodes_with_capacity( return available_nodes def generate_rebalancing_recommendations( - self, constraints: RecommendationConstraints - ) -> List[ShardMoveRecommendation]: + self, constraints: ShardRelocationConstraints + ) -> List[ShardRelocationResponse]: """Generate recommendations for rebalancing shards Args: @@ -191,7 +199,7 @@ def generate_rebalancing_recommendations( source_node: If specified, only generate recommendations for shards on this node max_disk_usage_percent: Maximum disk usage percentage for target nodes """ - recommendations: List[ShardMoveRecommendation] = [] + recommendations: List[ShardRelocationResponse] = [] # Get moveable shards (only healthy ones for actual operations) moveable_shards = self.find_moveable_shards(constraints.min_size, constraints.max_size, constraints.table_name) @@ -279,7 +287,7 @@ def generate_rebalancing_recommendations( safe_target_nodes = [] for candidate_node in target_nodes: # Create a temporary recommendation to test safety - temp_rec = ShardMoveRecommendation( + temp_rec = ShardRelocationResponse( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, @@ -341,7 +349,7 @@ def generate_rebalancing_recommendations( if shard.zone == target_node.zone: reason = f"Node balancing within {shard.zone}" - recommendation = ShardMoveRecommendation( + recommendation = ShardRelocationResponse( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, @@ -363,7 +371,7 @@ def generate_rebalancing_recommendations( return recommendations def validate_move_safety( - self, recommendation: ShardMoveRecommendation, max_disk_usage_percent: float = 90.0 + self, recommendation: ShardRelocationResponse, max_disk_usage_percent: float = 90.0 ) -> Tuple[bool, str]: """Validate that a move recommendation is safe to execute""" # Find target node (with caching) @@ -409,7 +417,7 @@ def _get_node_cached(self, 
node_name: str): self._node_lookup_cache[node_name] = target_node return target_node - def _check_zone_conflict_cached(self, recommendation: ShardMoveRecommendation) -> Optional[str]: + def _check_zone_conflict_cached(self, recommendation: ShardRelocationResponse) -> Optional[str]: """Check zone conflicts with caching""" # Create cache key: table, shard, target zone target_zone = self._get_node_zone(recommendation.to_node) @@ -467,7 +475,7 @@ def _find_nodes_with_capacity_cached( self._target_nodes_cache[cache_key] = result return result - def _check_zone_conflict(self, recommendation: ShardMoveRecommendation) -> Optional[str]: + def _check_zone_conflict(self, recommendation: ShardRelocationResponse) -> Optional[str]: """Check if moving this shard would create a zone conflict Performs comprehensive zone safety analysis: @@ -753,7 +761,7 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100. safe_targets = [] for target in potential_targets: # Create a temporary recommendation to test zone safety - temp_rec = ShardMoveRecommendation( + temp_rec = ShardRelocationResponse( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, @@ -778,7 +786,7 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100. # Choose the target with most available space best_target = safe_targets[0] move_plan.append( - ShardMoveRecommendation( + ShardRelocationResponse( table_name=shard.table_name, schema_name=shard.schema_name, shard_id=shard.shard_id, @@ -826,3 +834,116 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100. 
"estimated_time_hours": len(move_plan) * 0.1, # Rough estimate: 6 minutes per move "message": "Decommission plan generated" if feasible else "Decommission not currently feasible", } + + +class ShardReporter: + def __init__(self, analyzer: ShardAnalyzer): + self.analyzer = analyzer + + def distribution(self, table: str = None): + """Analyze current shard distribution across nodes and zones""" + console.print(Panel.fit("[bold blue]CrateDB Cluster Analysis[/bold blue]")) + + # Get cluster overview (includes all shards for complete analysis) + overview: Dict[str, Any] = self.analyzer.get_cluster_overview() + + # Cluster summary table + summary_table = Table(title="Cluster Summary", box=box.ROUNDED) + summary_table.add_column("Metric", style="cyan") + summary_table.add_column("Value", style="magenta") + + summary_table.add_row("Nodes", str(overview["nodes"])) + summary_table.add_row("Availability Zones", str(overview["zones"])) + summary_table.add_row("Total Shards", str(overview["total_shards"])) + summary_table.add_row("Primary Shards", str(overview["primary_shards"])) + summary_table.add_row("Replica Shards", str(overview["replica_shards"])) + summary_table.add_row("Total Size", format_size(overview["total_size_gb"])) + + console.print(summary_table) + console.print() + + # Disk watermarks table + if overview.get("watermarks"): + watermarks_table = Table(title="Disk Allocation Watermarks", box=box.ROUNDED) + watermarks_table.add_column("Setting", style="cyan") + watermarks_table.add_column("Value", style="magenta") + + watermarks = overview["watermarks"] + watermarks_table.add_row("Low Watermark", str(watermarks.get("low", "Not set"))) + watermarks_table.add_row("High Watermark", str(watermarks.get("high", "Not set"))) + watermarks_table.add_row("Flood Stage", str(watermarks.get("flood_stage", "Not set"))) + watermarks_table.add_row( + "Enable for Single Node", str(watermarks.get("enable_for_single_data_node", "Not set")) + ) + + console.print(watermarks_table) + 
console.print() + + # Zone distribution table + zone_table = Table(title="Zone Distribution", box=box.ROUNDED) + zone_table.add_column("Zone", style="cyan") + zone_table.add_column("Shards", justify="right", style="magenta") + zone_table.add_column("Percentage", justify="right", style="green") + + total_shards = overview["total_shards"] + for zone, count in overview["zone_distribution"].items(): + percentage = (count / total_shards * 100) if total_shards > 0 else 0 + zone_table.add_row(zone, str(count), f"{percentage:.1f}%") + + console.print(zone_table) + console.print() + + # Node health table + node_table = Table(title="Node Health", box=box.ROUNDED) + node_table.add_column("Node", style="cyan") + node_table.add_column("Zone", style="blue") + node_table.add_column("Shards", justify="right", style="magenta") + node_table.add_column("Size", justify="right", style="green") + node_table.add_column("Disk Usage", justify="right") + node_table.add_column("Available Space", justify="right", style="green") + node_table.add_column("Until Low WM", justify="right", style="yellow") + node_table.add_column("Until High WM", justify="right", style="red") + + for node_info in overview["node_health"]: + # Format watermark remaining capacity + low_wm_remaining = ( + format_size(node_info["remaining_to_low_watermark_gb"]) + if node_info["remaining_to_low_watermark_gb"] > 0 + else "[red]Exceeded[/red]" + ) + high_wm_remaining = ( + format_size(node_info["remaining_to_high_watermark_gb"]) + if node_info["remaining_to_high_watermark_gb"] > 0 + else "[red]Exceeded[/red]" + ) + + node_table.add_row( + node_info["name"], + node_info["zone"], + str(node_info["shards"]), + format_size(node_info["size_gb"]), + format_percentage(node_info["disk_usage_percent"]), + format_size(node_info["available_space_gb"]), + low_wm_remaining, + high_wm_remaining, + ) + + console.print(node_table) + + # Table-specific analysis if requested + if table: + console.print() + console.print(Panel.fit(f"[bold 
blue]Analysis for table: {table}[/bold blue]")) + + stats = self.analyzer.analyze_distribution(table) + + table_summary = Table(title=f"Table {table} Distribution", box=box.ROUNDED) + table_summary.add_column("Metric", style="cyan") + table_summary.add_column("Value", style="magenta") + + table_summary.add_row("Total Shards", str(stats.total_shards)) + table_summary.add_row("Total Size", format_size(stats.total_size_gb)) + table_summary.add_row("Zone Balance Score", f"{stats.zone_balance_score:.1f}/100") + table_summary.add_row("Node Balance Score", f"{stats.node_balance_score:.1f}/100") + + console.print(table_summary) diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 53cce1f9..f77259b6 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -10,16 +10,15 @@ import click from rich.console import Console -from cratedb_toolkit.admin.xmover.analysis.report import ShardReporter -from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer +from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer, ShardReporter from cratedb_toolkit.admin.xmover.analysis.zone import ZoneReport from cratedb_toolkit.admin.xmover.model import ( - RecommendationConstraints, - ShardMoveRequest, + ShardRelocationConstraints, + ShardRelocationRequest, SizeCriteria, ) from cratedb_toolkit.admin.xmover.operational.candidates import CandidateFinder -from cratedb_toolkit.admin.xmover.operational.recommend import Recommender +from cratedb_toolkit.admin.xmover.operational.recommend import ShardRelocationRecommender from cratedb_toolkit.admin.xmover.operational.recover import RecoveryMonitor, RecoveryOptions from cratedb_toolkit.admin.xmover.util.database import CrateDBClient from cratedb_toolkit.admin.xmover.util.error import explain_cratedb_error @@ -128,9 +127,9 @@ def recommend( auto_execute: bool, ): """Generate shard movement recommendations for rebalancing""" - recommender = Recommender( - 
client=ctx.obj["client"], - constraints=RecommendationConstraints( + recommender = ShardRelocationRecommender(client=ctx.obj["client"]) + recommender.execute( + constraints=ShardRelocationConstraints( table_name=table, source_node=node, min_size=min_size, @@ -141,8 +140,10 @@ def recommend( max_disk_usage=max_disk_usage, prioritize_space=prioritize_space, ), + auto_execute=auto_execute, + validate=validate, + dry_run=dry_run, ) - recommender.start(auto_execute=auto_execute, validate=validate, dry_run=dry_run) @main.command() @@ -212,11 +213,9 @@ def validate_move(ctx, schema_table: str, shard_id: int, from_node: str, to_node Example: xmover validate-move CUROV.maddoxxS 4 data-hot-1 data-hot-3 """ - client = ctx.obj["client"] - analyzer = ShardAnalyzer(client) - reporter = ShardReporter(analyzer) - reporter.validate_move( - request=ShardMoveRequest( + recommender = ShardRelocationRecommender(client=ctx.obj["client"]) + recommender.validate( + request=ShardRelocationRequest( schema_table=schema_table, shard_id=shard_id, from_node=from_node, diff --git a/cratedb_toolkit/admin/xmover/model.py b/cratedb_toolkit/admin/xmover/model.py index 056d834c..34e43f77 100644 --- a/cratedb_toolkit/admin/xmover/model.py +++ b/cratedb_toolkit/admin/xmover/model.py @@ -103,7 +103,7 @@ def translog_percentage(self) -> float: @dataclass -class ShardMoveRequest: +class ShardRelocationRequest: """Request for moving a shard""" schema_table: str @@ -114,7 +114,7 @@ class ShardMoveRequest: @dataclass -class ShardMoveRecommendation: +class ShardRelocationResponse: """Recommendation for moving a shard""" table_name: str @@ -174,9 +174,9 @@ class SizeCriteria: @dataclasses.dataclass -class RecommendationConstraints: - min_size: float = 40.0 - max_size: float = 60.0 +class ShardRelocationConstraints: + min_size: float = SizeCriteria().min_size + max_size: float = SizeCriteria().max_size table_name: Optional[str] = None source_node: Optional[str] = None zone_tolerance: float = 10.0 diff --git 
a/cratedb_toolkit/admin/xmover/operational/recommend.py b/cratedb_toolkit/admin/xmover/operational/recommend.py index d7ff0a07..8eb37b13 100644 --- a/cratedb_toolkit/admin/xmover/operational/recommend.py +++ b/cratedb_toolkit/admin/xmover/operational/recommend.py @@ -6,7 +6,11 @@ from rich.table import Table from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer -from cratedb_toolkit.admin.xmover.model import RecommendationConstraints +from cratedb_toolkit.admin.xmover.model import ( + ShardRelocationConstraints, + ShardRelocationRequest, + ShardRelocationResponse, +) from cratedb_toolkit.admin.xmover.operational.recover import RecoveryMonitor, RecoveryOptions from cratedb_toolkit.admin.xmover.util.database import CrateDBClient from cratedb_toolkit.admin.xmover.util.format import format_size @@ -14,14 +18,145 @@ console = Console() -class Recommender: - def __init__(self, client: CrateDBClient, constraints: RecommendationConstraints): +class ShardRelocationRecommender: + def __init__(self, client: CrateDBClient): self.client = client - self.constraints = constraints self.analyzer = ShardAnalyzer(self.client) - def start( + def validate(self, request: ShardRelocationRequest): + # Parse schema and table + if "." 
not in request.schema_table: + console.print("[red]Error: Schema and table must be in format 'schema.table'[/red]") + return + + schema_name, table_name = request.schema_table.split(".", 1) + + console.print(Panel.fit("[bold blue]Validating Shard Move[/bold blue]")) + console.print( + f"[dim]Move: {schema_name}.{table_name}[{request.shard_id}] " + f"from {request.from_node} to {request.to_node}[/dim]" + ) + console.print() + + # Find the nodes + from_node_info = None + to_node_info = None + for node in self.analyzer.nodes: + if node.name == request.from_node: + from_node_info = node + if node.name == request.to_node: + to_node_info = node + + if not from_node_info: + console.print(f"[red]βœ— Source node '{request.from_node}' not found in cluster[/red]") + return + + if not to_node_info: + console.print(f"[red]βœ— Target node '{request.to_node}' not found in cluster[/red]") + return + + # Find the specific shard + target_shard = None + for shard in self.analyzer.shards: + if ( + shard.schema_name == schema_name + and shard.table_name == table_name + and shard.shard_id == request.shard_id + and shard.node_name == request.from_node + ): + target_shard = shard + break + + if not target_shard: + console.print(f"[red]βœ— Shard {request.shard_id} not found on node {request.from_node}[/red]") + console.print("[dim]Use 'xmover find-candidates' to see available shards[/dim]") + return + + # Create a move recommendation for validation + recommendation = ShardRelocationResponse( + table_name=table_name, + schema_name=schema_name, + shard_id=request.shard_id, + from_node=request.from_node, + to_node=request.to_node, + from_zone=from_node_info.zone, + to_zone=to_node_info.zone, + shard_type=target_shard.shard_type, + size_gb=target_shard.size_gb, + reason="Manual validation", + ) + + # Display shard details + details_table = Table(title="Shard Details", box=box.ROUNDED) + details_table.add_column("Property", style="cyan") + details_table.add_column("Value", style="magenta") + + 
details_table.add_row("Table", f"{schema_name}.{table_name}") + details_table.add_row("Shard ID", str(request.shard_id)) + details_table.add_row("Type", target_shard.shard_type) + details_table.add_row("Size", format_size(target_shard.size_gb)) + details_table.add_row("Documents", f"{target_shard.num_docs:,}") + details_table.add_row("State", target_shard.state) + details_table.add_row("Routing State", target_shard.routing_state) + details_table.add_row("From Node", f"{request.from_node} ({from_node_info.zone})") + details_table.add_row("To Node", f"{request.to_node} ({to_node_info.zone})") + details_table.add_row("Zone Change", "Yes" if from_node_info.zone != to_node_info.zone else "No") + + console.print(details_table) + console.print() + + # Perform comprehensive validation + is_safe, safety_msg = self.analyzer.validate_move_safety( + recommendation, max_disk_usage_percent=request.max_disk_usage + ) + + if is_safe: + console.print("[green]βœ“ VALIDATION PASSED - Move appears safe[/green]") + console.print(f"[green]βœ“ {safety_msg}[/green]") + console.print() + + # Show the SQL command + console.print(Panel.fit("[bold green]Ready to Execute[/bold green]")) + console.print("[dim]# Copy and paste this command to execute the move[/dim]") + console.print() + console.print(f"{recommendation.to_sql()}") + console.print() + console.print("[dim]# Monitor shard health after execution[/dim]") + console.print( + "[dim]# Check with: SELECT * FROM sys.shards " + "WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]" + ) + else: + console.print("[red]βœ— VALIDATION FAILED - Move not safe[/red]") + console.print(f"[red]βœ— {safety_msg}[/red]") + console.print() + + # Provide troubleshooting guidance + if "zone conflict" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting Zone Conflicts:[/yellow]") + console.print(" β€’ Check current shard distribution: xmover zone-analysis --show-shards") + console.print(" β€’ Try moving to a different zone") + 
console.print(" β€’ Verify cluster has proper zone-awareness configuration") + elif "node conflict" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting Node Conflicts:[/yellow]") + console.print(" β€’ The target node already has a copy of this shard") + console.print(" β€’ Choose a different target node") + console.print(" β€’ Check shard distribution: xmover analyze") + elif "space" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting Space Issues:[/yellow]") + console.print(" β€’ Free up space on the target node") + console.print(" β€’ Choose a node with more available capacity") + console.print(" β€’ Check node capacity: xmover analyze") + elif "usage" in safety_msg.lower(): + console.print("[yellow]πŸ’‘ Troubleshooting High Disk Usage:[/yellow]") + console.print(" β€’ Wait for target node disk usage to decrease") + console.print(" β€’ Choose a node with lower disk usage") + console.print(" β€’ Check cluster health: xmover analyze") + console.print(" β€’ Consider using --max-disk-usage option for urgent moves") + + def execute( self, + constraints: ShardRelocationConstraints, auto_execute: bool, validate: bool, dry_run: bool, @@ -41,17 +176,17 @@ def start( ) console.print("[dim]Note: Only analyzing healthy shards (STARTED + 100% recovered) for safe operations[/dim]") console.print("[dim]Zone conflict detection: Prevents moves that would violate CrateDB's zone awareness[/dim]") - if self.constraints.prioritize_space: + if constraints.prioritize_space: console.print("[dim]Mode: Prioritizing available space over zone balancing[/dim]") else: console.print("[dim]Mode: Prioritizing zone balancing over available space[/dim]") - if self.constraints.source_node: - console.print(f"[dim]Filtering: Only showing moves from source node '{self.constraints.source_node}'[/dim]") + if constraints.source_node: + console.print(f"[dim]Filtering: Only showing moves from source node '{constraints.source_node}'[/dim]") console.print( - f"[dim]Safety 
thresholds: Max disk usage {self.constraints.max_disk_usage}%, " - f"Min free space {self.constraints.min_free_space}GB[/dim]" + f"[dim]Safety thresholds: Max disk usage {constraints.max_disk_usage}%, " + f"Min free space {constraints.min_free_space}GB[/dim]" ) if dry_run: @@ -60,24 +195,20 @@ def start( console.print("[red]EXECUTION MODE - SQL commands will be generated for actual moves[/red]") console.print() - recommendations = self.analyzer.generate_rebalancing_recommendations(constraints=self.constraints) + recommendations = self.analyzer.generate_rebalancing_recommendations(constraints=constraints) if not recommendations: - if self.constraints.source_node: - console.print( - f"[yellow]No safe recommendations found for node '{self.constraints.source_node}'[/yellow]" - ) + if constraints.source_node: + console.print(f"[yellow]No safe recommendations found for node '{constraints.source_node}'[/yellow]") console.print("[dim]This could be due to:[/dim]") console.print("[dim] β€’ Zone conflicts preventing safe moves[/dim]") console.print( - f"[dim] β€’ Target nodes exceeding {self.constraints.max_disk_usage}% disk usage threshold[/dim]" - ) - console.print( - f"[dim] β€’ Insufficient free space on target nodes (need {self.constraints.min_free_space}GB)[/dim]" + f"[dim] β€’ Target nodes exceeding {constraints.max_disk_usage}% disk usage threshold[/dim]" ) console.print( - f"[dim] β€’ No shards in size range {self.constraints.min_size}-{self.constraints.max_size}GB[/dim]" + f"[dim] β€’ Insufficient free space on target nodes (need {constraints.min_free_space}GB)[/dim]" ) + console.print(f"[dim] β€’ No shards in size range {constraints.min_size}-{constraints.max_size}GB[/dim]") console.print("[dim]Suggestions:[/dim]") console.print("[dim] β€’ Try: --max-disk-usage 95 (allow higher disk usage)[/dim]") console.print("[dim] β€’ Try: --min-free-space 50 (reduce space requirements)[/dim]") @@ -121,7 +252,7 @@ def start( if validate: is_safe, safety_msg = 
self.analyzer.validate_move_safety( - rec, max_disk_usage_percent=self.constraints.max_disk_usage + rec, max_disk_usage_percent=constraints.max_disk_usage ) safety_status = "[green]βœ“ SAFE[/green]" if is_safe else f"[red]βœ— {safety_msg}[/red]" row.append(safety_status) @@ -145,7 +276,7 @@ def start( for i, rec in enumerate(recommendations, 1): if validate: is_safe, safety_msg = self.analyzer.validate_move_safety( - rec, max_disk_usage_percent=self.constraints.max_disk_usage + rec, max_disk_usage_percent=constraints.max_disk_usage ) if not is_safe: if "zone conflict" in safety_msg.lower(): @@ -189,7 +320,7 @@ def start( for i, rec in enumerate(recommendations, 1): if validate: is_safe, safety_msg = self.analyzer.validate_move_safety( - rec, max_disk_usage_percent=self.constraints.max_disk_usage + rec, max_disk_usage_percent=constraints.max_disk_usage ) if not is_safe: if "Zone conflict" in safety_msg: From eb7aa7ddfb9146caf35f9f357e7f6fe4e3ded637 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 21 Aug 2025 02:45:26 +0200 Subject: [PATCH 14/18] Admin/XMover: Naming things. 
s/recover/monitor/ --- cratedb_toolkit/admin/xmover/cli.py | 2 +- .../admin/xmover/operational/{recover.py => monitor.py} | 0 cratedb_toolkit/admin/xmover/operational/recommend.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename cratedb_toolkit/admin/xmover/operational/{recover.py => monitor.py} (100%) diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index f77259b6..339f9e7f 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -18,8 +18,8 @@ SizeCriteria, ) from cratedb_toolkit.admin.xmover.operational.candidates import CandidateFinder +from cratedb_toolkit.admin.xmover.operational.monitor import RecoveryMonitor, RecoveryOptions from cratedb_toolkit.admin.xmover.operational.recommend import ShardRelocationRecommender -from cratedb_toolkit.admin.xmover.operational.recover import RecoveryMonitor, RecoveryOptions from cratedb_toolkit.admin.xmover.util.database import CrateDBClient from cratedb_toolkit.admin.xmover.util.error import explain_cratedb_error diff --git a/cratedb_toolkit/admin/xmover/operational/recover.py b/cratedb_toolkit/admin/xmover/operational/monitor.py similarity index 100% rename from cratedb_toolkit/admin/xmover/operational/recover.py rename to cratedb_toolkit/admin/xmover/operational/monitor.py diff --git a/cratedb_toolkit/admin/xmover/operational/recommend.py b/cratedb_toolkit/admin/xmover/operational/recommend.py index 8eb37b13..ab5156e6 100644 --- a/cratedb_toolkit/admin/xmover/operational/recommend.py +++ b/cratedb_toolkit/admin/xmover/operational/recommend.py @@ -11,7 +11,7 @@ ShardRelocationRequest, ShardRelocationResponse, ) -from cratedb_toolkit.admin.xmover.operational.recover import RecoveryMonitor, RecoveryOptions +from cratedb_toolkit.admin.xmover.operational.monitor import RecoveryMonitor, RecoveryOptions from cratedb_toolkit.admin.xmover.util.database import CrateDBClient from cratedb_toolkit.admin.xmover.util.format import format_size From 
46e9c00e7fe92c515e2516a1e9ae282ea38f065d Mon Sep 17 00:00:00 2001 From: Walter Behmann Date: Thu, 21 Aug 2025 13:55:25 +0200 Subject: [PATCH 15/18] Admin/XMover: Suppress SSL warnings when SSL verification is disabled --- cratedb_toolkit/admin/xmover/util/database.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cratedb_toolkit/admin/xmover/util/database.py b/cratedb_toolkit/admin/xmover/util/database.py index 1cb16bb1..21950ab0 100644 --- a/cratedb_toolkit/admin/xmover/util/database.py +++ b/cratedb_toolkit/admin/xmover/util/database.py @@ -7,6 +7,7 @@ from typing import Any, Dict, List, Optional, Union import requests +import urllib3 from dotenv import load_dotenv from cratedb_toolkit.admin.xmover.model import NodeInfo, RecoveryInfo, ShardInfo @@ -30,6 +31,10 @@ def __init__(self, connection_string: Optional[str] = None): self.password = os.getenv("CRATE_PASSWORD") self.ssl_verify = os.getenv("CRATE_SSL_VERIFY", "true").lower() == "true" + # Suppress SSL warnings when SSL verification is disabled + if not self.ssl_verify: + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + # Ensure connection string ends with _sql endpoint if not self.connection_string.endswith("/_sql"): self.connection_string = self.connection_string.rstrip("/") + "/_sql" From 3704c96d5263f4033a50ce1275ceaa28b1fe08c8 Mon Sep 17 00:00:00 2001 From: Walter Behmann Date: Thu, 21 Aug 2025 14:02:46 +0200 Subject: [PATCH 16/18] Admin/XMover: Add shard distribution analysis for (large) tables --- .../admin/xmover/analysis/table.py | 783 ++++++++++++++++++ cratedb_toolkit/admin/xmover/cli.py | 64 ++ doc/admin/xmover/handbook.md | 13 + doc/admin/xmover/index.md | 1 + doc/admin/xmover/queries.md | 31 + tests/admin/test_cli.py | 1 + 6 files changed, 893 insertions(+) create mode 100644 cratedb_toolkit/admin/xmover/analysis/table.py diff --git a/cratedb_toolkit/admin/xmover/analysis/table.py b/cratedb_toolkit/admin/xmover/analysis/table.py new file mode 100644 index 
00000000..b8f1a7ce --- /dev/null +++ b/cratedb_toolkit/admin/xmover/analysis/table.py @@ -0,0 +1,783 @@ +""" +Shard Distribution Analysis for CrateDB Clusters + +This module analyzes shard distribution across nodes to detect imbalances +and provide recommendations for optimization. +""" + +import statistics +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +from rich import print as rprint +from rich.console import Console +from rich.table import Table + +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient + + +def format_storage_size(size_gb: float) -> str: + """Format storage size with appropriate units and spacing""" + if size_gb < 0.001: + return "0 B" + elif size_gb < 1.0: + size_mb = size_gb * 1024 + return f"{size_mb:.0f} MB" + elif size_gb < 1024: + return f"{size_gb:.1f} GB" + else: + size_tb = size_gb / 1024 + return f"{size_tb:.2f} TB" + + +@dataclass +class TableDistribution: + """Represents shard distribution for a single table""" + + schema_name: str + table_name: str + total_primary_size_gb: float + node_distributions: Dict[str, Dict[str, Any]] # node_name -> metrics + + @property + def full_table_name(self) -> str: + return f"{self.schema_name}.{self.table_name}" if self.schema_name != "doc" else self.table_name + + +@dataclass +class DistributionAnomaly: + """Represents a detected distribution anomaly""" + + table: TableDistribution + anomaly_type: str + severity_score: float + impact_score: float + combined_score: float + description: str + details: Dict[str, Any] + recommendations: List[str] + + +class DistributionAnalyzer: + """Analyzes shard distribution across cluster nodes""" + + def __init__(self, client: CrateDBClient): + self.client = client + self.console = Console() + + def find_table_by_name(self, table_name: str) -> Optional[str]: + """Find table by name and resolve schema ambiguity""" + + query = """ + SELECT DISTINCT schema_name, table_name + FROM sys.shards + WHERE table_name 
= ? + AND schema_name NOT IN ('sys', 'information_schema', 'pg_catalog') + AND routing_state = 'STARTED' + ORDER BY schema_name \ + """ + + result = self.client.execute_query(query, [table_name]) + rows = result.get("rows", []) + + if not rows: + return None + elif len(rows) == 1: + schema, table = rows[0] + return f"{schema}.{table}" if schema != "doc" else table + else: + # Multiple schemas have this table - ask user + rprint(f"[yellow]Multiple schemas contain table '{table_name}':[/yellow]") + for i, (schema, table) in enumerate(rows, 1): + full_name = f"{schema}.{table}" if schema != "doc" else table + rprint(f" {i}. {full_name}") + + try: + choice = input("\nSelect table (enter number): ").strip() + idx = int(choice) - 1 + if 0 <= idx < len(rows): + schema, table = rows[idx] + return f"{schema}.{table}" if schema != "doc" else table + else: + rprint("[red]Invalid selection[/red]") + return None + except (ValueError, KeyboardInterrupt): + rprint("\n[yellow]Selection cancelled[/yellow]") + return None + + def get_table_distribution_detailed(self, table_identifier: str) -> Optional[TableDistribution]: + """Get detailed distribution data for a specific table""" + + # Parse schema and table name + if "." 
in table_identifier: + schema_name, table_name = table_identifier.split(".", 1) + else: + schema_name = "doc" + table_name = table_identifier + + query = """ + SELECT s.schema_name, \ + s.table_name, \ + s.node['name'] as node_name, \ + COUNT(CASE WHEN s."primary" = true THEN 1 END) as primary_shards, \ + COUNT(CASE WHEN s."primary" = false THEN 1 END) as replica_shards, \ + COUNT(*) as total_shards, \ + ROUND(SUM(s.size) / 1024.0 / 1024.0 / 1024.0, 2) as total_size_gb, \ + ROUND(SUM(CASE WHEN s."primary" = true THEN s.size ELSE 0 END) / 1024.0 / 1024.0 / 1024.0, \ + 2) as primary_size_gb, \ + ROUND(SUM(CASE WHEN s."primary" = false THEN s.size ELSE 0 END) / 1024.0 / 1024.0 / 1024.0, \ + 2) as replica_size_gb, \ + SUM(s.num_docs) as total_documents + FROM sys.shards s + WHERE s.schema_name = ? \ + AND s.table_name = ? + AND s.routing_state = 'STARTED' + GROUP BY s.schema_name, s.table_name, s.node['name'] + ORDER BY s.node['name'] \ + """ + + result = self.client.execute_query(query, [schema_name, table_name]) + rows = result.get("rows", []) + + if not rows: + return None + + # Build node distributions + node_distributions = {} + for row in rows: + node_distributions[row[2]] = { + "primary_shards": row[3], + "replica_shards": row[4], + "total_shards": row[5], + "total_size_gb": row[6], + "primary_size_gb": row[7], + "replica_size_gb": row[8], + "total_documents": row[9], + } + + # Calculate total primary size + total_primary_size = sum(node["primary_size_gb"] for node in node_distributions.values()) + + return TableDistribution( + schema_name=rows[0][0], + table_name=rows[0][1], + total_primary_size_gb=total_primary_size, + node_distributions=node_distributions, + ) + + def format_table_health_report(self, table_dist: TableDistribution) -> None: + """Format and display comprehensive table health report""" + + rprint(f"\n[bold blue]πŸ“‹ Table Health Report: {table_dist.full_table_name}[/bold blue]") + rprint("=" * 80) + + # Calculate overview stats + all_nodes_info 
= self.client.get_nodes_info() + cluster_nodes = {node.name for node in all_nodes_info if node.name} + table_nodes = set(table_dist.node_distributions.keys()) + missing_nodes = cluster_nodes - table_nodes + + total_shards = sum(node["total_shards"] for node in table_dist.node_distributions.values()) + total_primary_shards = sum(node["primary_shards"] for node in table_dist.node_distributions.values()) + total_replica_shards = sum(node["replica_shards"] for node in table_dist.node_distributions.values()) + total_size_gb = sum(node["total_size_gb"] for node in table_dist.node_distributions.values()) + total_documents = sum(node["total_documents"] for node in table_dist.node_distributions.values()) + + # Table Overview + rprint("\n[bold]🎯 Overview[/bold]") + rprint(f"β€’ Primary Data Size: {format_storage_size(table_dist.total_primary_size_gb)}") + rprint(f"β€’ Total Size (with replicas): {format_storage_size(total_size_gb)}") + rprint(f"β€’ Total Shards: {total_shards} ({total_primary_shards} primary + {total_replica_shards} replica)") + rprint(f"β€’ Total Documents: {total_documents:,}") + rprint( + f"β€’ Node Coverage: {len(table_nodes)}/{len(cluster_nodes)} nodes ({len(table_nodes) / len(cluster_nodes) * 100:.0f}%)" + ) + + if missing_nodes: + rprint(f"β€’ [yellow]Missing from nodes: {', '.join(sorted(missing_nodes))}[/yellow]") + + # Shard Distribution Table + rprint("\n[bold]πŸ“Š Shard Distribution by Node[/bold]") + + shard_table = Table(show_header=True) + shard_table.add_column("Node", width=15) + shard_table.add_column("Primary", width=8, justify="right") + shard_table.add_column("Replica", width=8, justify="right") + shard_table.add_column("Total", width=8, justify="right") + shard_table.add_column("Primary Size", width=12, justify="right") + shard_table.add_column("Replica Size", width=12, justify="right") + shard_table.add_column("Total Size", width=12, justify="right") + shard_table.add_column("Documents", width=12, justify="right") + + for node_name in 
sorted(table_dist.node_distributions.keys()): + node_data = table_dist.node_distributions[node_name] + + # Color coding based on shard count compared to average + avg_total_shards = total_shards / len(table_dist.node_distributions) + if node_data["total_shards"] > avg_total_shards * 1.5: + node_color = "red" + elif node_data["total_shards"] < avg_total_shards * 0.5: + node_color = "yellow" + else: + node_color = "white" + + shard_table.add_row( + f"[{node_color}]{node_name}[/{node_color}]", + str(node_data["primary_shards"]), + str(node_data["replica_shards"]), + f"[{node_color}]{node_data['total_shards']}[/{node_color}]", + format_storage_size(node_data["primary_size_gb"]), + format_storage_size(node_data["replica_size_gb"]), + f"[{node_color}]{format_storage_size(node_data['total_size_gb'])}[/{node_color}]", + f"{node_data['total_documents']:,}", + ) + + self.console.print(shard_table) + + # Distribution Analysis + rprint("\n[bold]πŸ” Distribution Analysis[/bold]") + + # Calculate statistics + shard_counts = [node["total_shards"] for node in table_dist.node_distributions.values()] + storage_sizes = [node["total_size_gb"] for node in table_dist.node_distributions.values()] + doc_counts = [node["total_documents"] for node in table_dist.node_distributions.values()] + + shard_cv = self.calculate_coefficient_of_variation(shard_counts) + storage_cv = self.calculate_coefficient_of_variation(storage_sizes) + doc_cv = self.calculate_coefficient_of_variation(doc_counts) + + min_shards, max_shards = min(shard_counts), max(shard_counts) + min_storage, max_storage = min(storage_sizes), max(storage_sizes) + min_docs, max_docs = min(doc_counts), max(doc_counts) + + # Shard distribution analysis + if shard_cv > 0.3: + rprint( + f"β€’ [red]⚠ Shard Imbalance:[/red] Range {min_shards}-{max_shards} shards per node (CV: {shard_cv:.2f})" + ) + else: + rprint(f"β€’ [green]βœ“ Shard Balance:[/green] Well distributed (CV: {shard_cv:.2f})") + + # Storage distribution analysis + if 
storage_cv > 0.4: + rprint( + f"β€’ [red]⚠ Storage Imbalance:[/red] Range {format_storage_size(min_storage)}-{format_storage_size(max_storage)} per node (CV: {storage_cv:.2f})" + ) + else: + rprint(f"β€’ [green]βœ“ Storage Balance:[/green] Well distributed (CV: {storage_cv:.2f})") + + # Document distribution analysis + if doc_cv > 0.5: + rprint(f"β€’ [red]⚠ Document Skew:[/red] Range {min_docs:,}-{max_docs:,} docs per node (CV: {doc_cv:.2f})") + else: + rprint(f"β€’ [green]βœ“ Document Distribution:[/green] Well balanced (CV: {doc_cv:.2f})") + + # Node coverage analysis + coverage_ratio = len(table_nodes) / len(cluster_nodes) + if coverage_ratio < 0.7: + missing_list = ", ".join(sorted(missing_nodes)[:5]) # Show up to 5 nodes + if len(missing_nodes) > 5: + missing_list += f", +{len(missing_nodes) - 5} more" + rprint(f"β€’ [red]⚠ Limited Coverage:[/red] {coverage_ratio:.0%} cluster coverage, missing: {missing_list}") + else: + rprint(f"β€’ [green]βœ“ Good Coverage:[/green] {coverage_ratio:.0%} of cluster nodes have this table") + + # Zone analysis if available + try: + zone_distribution = {} + for node_name, node_data in table_dist.node_distributions.items(): + # Try to get zone info for each node + node_info = next((n for n in all_nodes_info if n.name == node_name), None) + if ( + node_info + and hasattr(node_info, "attributes") + and node_info.attributes + and "zone" in node_info.attributes + ): + zone = node_info.attributes["zone"] + if zone not in zone_distribution: + zone_distribution[zone] = {"nodes": 0, "shards": 0, "size": 0} + zone_distribution[zone]["nodes"] += 1 + zone_distribution[zone]["shards"] += node_data["total_shards"] + zone_distribution[zone]["size"] += node_data["total_size_gb"] + + if zone_distribution: + rprint("\n[bold]🌍 Zone Distribution[/bold]") + for zone in sorted(zone_distribution.keys()): + zone_data = zone_distribution[zone] + rprint( + f"β€’ {zone}: {zone_data['nodes']} nodes, {zone_data['shards']} shards, 
{format_storage_size(zone_data['size'])}" + ) + + except Exception: + pass # Zone info not available + + # Health Summary + rprint("\n[bold]πŸ’Š Health Summary[/bold]") + issues = [] + recommendations = [] + + if shard_cv > 0.3: + issues.append("Shard imbalance") + recommendations.append("Consider moving shards between nodes for better distribution") + + if storage_cv > 0.4: + issues.append("Storage imbalance") + recommendations.append("Rebalance shards to distribute storage more evenly") + + if doc_cv > 0.5: + issues.append("Document skew") + recommendations.append("Review routing configuration - data may not be evenly distributed") + + if coverage_ratio < 0.7: + issues.append("Limited node coverage") + recommendations.append("Consider adding replicas to improve availability and distribution") + + if not issues: + rprint("β€’ [green]βœ… Table appears healthy with good distribution[/green]") + else: + rprint(f"β€’ [yellow]⚠ Issues found: {', '.join(issues)}[/yellow]") + rprint("\n[bold]πŸ’‘ Recommendations:[/bold]") + for rec in recommendations: + rprint(f" β€’ {rec}") + + rprint() + + def get_largest_tables_distribution(self, top_n: int = 10) -> List[TableDistribution]: + """Get distribution data for the largest tables using BIGDUDES query""" + + query = """ + WITH largest_tables AS (SELECT schema_name, \ + table_name, \ + SUM(CASE WHEN "primary" = true THEN size ELSE 0 END) as total_primary_size \ + FROM sys.shards \ + WHERE schema_name NOT IN ('sys', 'information_schema', 'pg_catalog') \ + AND routing_state = 'STARTED' \ + GROUP BY schema_name, table_name \ + ORDER BY total_primary_size DESC + LIMIT ? 
+ ) + SELECT s.schema_name, \ + s.table_name, \ + s.node['name'] as node_name, \ + COUNT(CASE WHEN s."primary" = true THEN 1 END) as primary_shards, \ + COUNT(CASE WHEN s."primary" = false THEN 1 END) as replica_shards, \ + COUNT(*) as total_shards, \ + ROUND(SUM(s.size) / 1024.0 / 1024.0 / 1024.0, 2) as total_size_gb, \ + ROUND(SUM(CASE WHEN s."primary" = true THEN s.size ELSE 0 END) / 1024.0 / 1024.0 / 1024.0, \ + 2) as primary_size_gb, \ + ROUND(SUM(CASE WHEN s."primary" = false THEN s.size ELSE 0 END) / 1024.0 / 1024.0 / 1024.0, \ + 2) as replica_size_gb, \ + SUM(s.num_docs) as total_documents + FROM sys.shards s + INNER JOIN largest_tables lt \ + ON (s.schema_name = lt.schema_name AND s.table_name = lt.table_name) + WHERE s.routing_state = 'STARTED' + GROUP BY s.schema_name, s.table_name, s.node['name'] + ORDER BY s.schema_name, s.table_name, s.node['name'] \ + """ + + result = self.client.execute_query(query, [top_n]) + + # Extract rows from the result dictionary + rows = result.get("rows", []) + + if not rows: + return [] + + # Group results by table + tables_data = {} + for row in rows: + # Ensure we have enough columns + if len(row) < 10: + continue + + table_key = f"{row[0]}.{row[1]}" + if table_key not in tables_data: + tables_data[table_key] = {"schema_name": row[0], "table_name": row[1], "nodes": {}} + + tables_data[table_key]["nodes"][row[2]] = { + "primary_shards": row[3], + "replica_shards": row[4], + "total_shards": row[5], + "total_size_gb": row[6], + "primary_size_gb": row[7], + "replica_size_gb": row[8], + "total_documents": row[9], + } + + # Calculate total primary sizes and create TableDistribution objects + distributions = [] + for table_data in tables_data.values(): + total_primary_size = sum(node["primary_size_gb"] for node in table_data["nodes"].values()) + + distribution = TableDistribution( + schema_name=table_data["schema_name"], + table_name=table_data["table_name"], + total_primary_size_gb=total_primary_size, + 
node_distributions=table_data["nodes"], + ) + distributions.append(distribution) + + # Sort by primary size (descending) + return sorted(distributions, key=lambda x: x.total_primary_size_gb, reverse=True) + + def calculate_coefficient_of_variation(self, values: List[float]) -> float: + """Calculate coefficient of variation (std dev / mean)""" + if not values or len(values) < 2: + return 0.0 + + mean_val = statistics.mean(values) + if mean_val == 0: + return 0.0 + + try: + std_dev = statistics.stdev(values) + return std_dev / mean_val + except statistics.StatisticsError: + return 0.0 + + def detect_shard_count_imbalance(self, table: TableDistribution) -> Optional[DistributionAnomaly]: + """Detect imbalances in shard count distribution""" + if not table.node_distributions: + return None + + # Get shard counts per node + total_shards = [node["total_shards"] for node in table.node_distributions.values()] + primary_shards = [node["primary_shards"] for node in table.node_distributions.values()] + replica_shards = [node["replica_shards"] for node in table.node_distributions.values()] + + # Calculate coefficient of variation + total_cv = self.calculate_coefficient_of_variation(total_shards) + primary_cv = self.calculate_coefficient_of_variation(primary_shards) + replica_cv = self.calculate_coefficient_of_variation(replica_shards) + + # Severity based on highest CV (higher CV = more imbalanced) + max_cv = max(total_cv, primary_cv, replica_cv) + + # Consider it an anomaly if CV > 0.3 (30% variation) + if max_cv < 0.3: + return None + + # Impact based on table size + impact_score = min(table.total_primary_size_gb / 100.0, 10.0) # Cap at 10 + severity_score = min(max_cv * 10, 10.0) # Scale to 0-10 + combined_score = impact_score * severity_score + + # Generate recommendations + recommendations = [] + min_shards = min(total_shards) + max_shards = max(total_shards) + + if max_shards - min_shards > 1: + overloaded_nodes = [ + node for node, data in 
table.node_distributions.items() if data["total_shards"] == max_shards + ] + underloaded_nodes = [ + node for node, data in table.node_distributions.items() if data["total_shards"] == min_shards + ] + + if overloaded_nodes and underloaded_nodes: + recommendations.append(f"Move shards from {overloaded_nodes[0]} to {underloaded_nodes[0]}") + + return DistributionAnomaly( + table=table, + anomaly_type="Shard Count Imbalance", + severity_score=severity_score, + impact_score=impact_score, + combined_score=combined_score, + description=f"Uneven shard distribution (CV: {max_cv:.2f})", + details={ + "total_cv": total_cv, + "primary_cv": primary_cv, + "replica_cv": replica_cv, + "shard_counts": {node: data["total_shards"] for node, data in table.node_distributions.items()}, + }, + recommendations=recommendations, + ) + + def detect_storage_imbalance(self, table: TableDistribution) -> Optional[DistributionAnomaly]: + """Detect imbalances in storage distribution""" + if not table.node_distributions: + return None + + storage_sizes = [node["total_size_gb"] for node in table.node_distributions.values()] + + # Skip if all sizes are very small (< 1GB total) + if sum(storage_sizes) < 1.0: + return None + + cv = self.calculate_coefficient_of_variation(storage_sizes) + + # Consider it an anomaly if CV > 0.4 (40% variation) for storage + if cv < 0.4: + return None + + impact_score = min(table.total_primary_size_gb / 50.0, 10.0) + severity_score = min(cv * 8, 10.0) + combined_score = impact_score * severity_score + + # Generate recommendations + recommendations = [] + min_size = min(storage_sizes) + max_size = max(storage_sizes) + + if max_size > min_size * 2: # If difference is > 2x + overloaded_node = None + underloaded_node = None + + for node, data in table.node_distributions.items(): + if data["total_size_gb"] == max_size: + overloaded_node = node + elif data["total_size_gb"] == min_size: + underloaded_node = node + + if overloaded_node and underloaded_node: + 
recommendations.append( + f"Rebalance storage from {overloaded_node} ({format_storage_size(max_size)}) to {underloaded_node} ({format_storage_size(min_size)})" + ) + + return DistributionAnomaly( + table=table, + anomaly_type="Storage Imbalance", + severity_score=severity_score, + impact_score=impact_score, + combined_score=combined_score, + description=f"Uneven storage distribution (CV: {cv:.2f})", + details={ + "storage_cv": cv, + "storage_sizes": {node: data["total_size_gb"] for node, data in table.node_distributions.items()}, + }, + recommendations=recommendations, + ) + + def detect_node_coverage_issues(self, table: TableDistribution) -> Optional[DistributionAnomaly]: + """Detect nodes with missing shard coverage""" + if not table.node_distributions: + return None + + # Get all cluster nodes + all_nodes = set() + try: + nodes_info = self.client.get_nodes_info() + all_nodes = {node.name for node in nodes_info if node.name} + except Exception: + # If we can't get node info, use nodes that have shards + all_nodes = set(table.node_distributions.keys()) + + nodes_with_shards = set(table.node_distributions.keys()) + nodes_without_shards = all_nodes - nodes_with_shards + + # Only flag as anomaly if we have missing nodes and the table is significant + if not nodes_without_shards or table.total_primary_size_gb < 10.0: + return None + + coverage_ratio = len(nodes_with_shards) / len(all_nodes) + + # Consider it an anomaly if coverage < 70% + if coverage_ratio >= 0.7: + return None + + impact_score = min(table.total_primary_size_gb / 100.0, 10.0) + severity_score = (1 - coverage_ratio) * 10 # Higher severity for lower coverage + combined_score = impact_score * severity_score + + recommendations = [f"Consider adding replicas to nodes: {', '.join(sorted(nodes_without_shards))}"] + + return DistributionAnomaly( + table=table, + anomaly_type="Node Coverage Issue", + severity_score=severity_score, + impact_score=impact_score, + combined_score=combined_score, + 
description=f"Limited node coverage ({len(nodes_with_shards)}/{len(all_nodes)} nodes)", + details={ + "coverage_ratio": coverage_ratio, + "nodes_with_shards": sorted(nodes_with_shards), + "nodes_without_shards": sorted(nodes_without_shards), + }, + recommendations=recommendations, + ) + + def detect_document_imbalance(self, table: TableDistribution) -> Optional[DistributionAnomaly]: + """Detect imbalances in document distribution""" + if not table.node_distributions: + return None + + document_counts = [node["total_documents"] for node in table.node_distributions.values()] + + # Skip if total documents is very low + if sum(document_counts) < 10000: + return None + + cv = self.calculate_coefficient_of_variation(document_counts) + + # Consider it an anomaly if CV > 0.5 (50% variation) for documents + if cv < 0.5: + return None + + impact_score = min(table.total_primary_size_gb / 100.0, 10.0) + severity_score = min(cv * 6, 10.0) + combined_score = impact_score * severity_score + + # Generate recommendations + recommendations = ["Document imbalance may indicate data skew - consider reviewing shard routing"] + + min_docs = min(document_counts) + max_docs = max(document_counts) + + if max_docs > min_docs * 3: # If difference is > 3x + recommendations.append(f"Significant document skew detected ({min_docs:,} to {max_docs:,} docs per node)") + + return DistributionAnomaly( + table=table, + anomaly_type="Document Imbalance", + severity_score=severity_score, + impact_score=impact_score, + combined_score=combined_score, + description=f"Uneven document distribution (CV: {cv:.2f})", + details={ + "document_cv": cv, + "document_counts": {node: data["total_documents"] for node, data in table.node_distributions.items()}, + }, + recommendations=recommendations, + ) + + def analyze_distribution(self, top_tables: int = 10) -> List[DistributionAnomaly]: + """Analyze shard distribution and return ranked anomalies""" + + # Get table distributions + distributions = 
self.get_largest_tables_distribution(top_tables) + + # Detect all anomalies + anomalies = [] + + for table_dist in distributions: + # Check each type of anomaly + for detector in [ + self.detect_shard_count_imbalance, + self.detect_storage_imbalance, + self.detect_node_coverage_issues, + self.detect_document_imbalance, + ]: + anomaly = detector(table_dist) + if anomaly: + anomalies.append(anomaly) + + # Sort by combined score (highest first) + return sorted(anomalies, key=lambda x: x.combined_score, reverse=True), len(distributions) + + def format_distribution_report(self, anomalies: List[DistributionAnomaly], tables_analyzed: int) -> None: + """Format and display the distribution analysis report""" + + if not anomalies: + rprint( + f"[green]βœ“ No significant shard distribution anomalies detected in top {tables_analyzed} tables![/green]" + ) + return + + # Show analysis scope + unique_tables = set(anomaly.table.full_table_name for anomaly in anomalies) + rprint( + f"[blue]πŸ“‹ Analyzed {tables_analyzed} largest tables, found issues in {len(unique_tables)} tables[/blue]" + ) + rprint() + + # Summary table + table = Table(title="🎯 Shard Distribution Anomalies", show_header=True) + table.add_column("Rank", width=4) + table.add_column("Table", min_width=20) + table.add_column("Issue Type", min_width=15) + table.add_column("Score", width=8) + table.add_column("Primary Size", width=12) + table.add_column("Description", min_width=25) + + for i, anomaly in enumerate(anomalies[:10], 1): # Top 10 + # Color coding by severity + if anomaly.combined_score >= 50: + rank_color = "red" + elif anomaly.combined_score >= 25: + rank_color = "yellow" + else: + rank_color = "blue" + + table.add_row( + f"[{rank_color}]{i}[/{rank_color}]", + anomaly.table.full_table_name, + anomaly.anomaly_type, + f"[{rank_color}]{anomaly.combined_score:.1f}[/{rank_color}]", + format_storage_size(anomaly.table.total_primary_size_gb), + anomaly.description, + ) + + self.console.print(table) + + # Detailed 
recommendations for top issues + if anomalies: + rprint("\n[bold]πŸ”§ Top Recommendations:[/bold]") + + for i, anomaly in enumerate(anomalies[:5], 1): # Top 5 recommendations + rprint(f"\n[bold]{i}. {anomaly.table.full_table_name}[/bold] - {anomaly.anomaly_type}") + + # Show the problem analysis first + rprint(f" [yellow]πŸ” Problem:[/yellow] {anomaly.description}") + + # Add specific details about what's wrong + if anomaly.anomaly_type == "Shard Count Imbalance": + if "shard_counts" in anomaly.details: + counts = anomaly.details["shard_counts"] + min_count = min(counts.values()) + max_count = max(counts.values()) + overloaded = [node for node, count in counts.items() if count == max_count] + underloaded = [node for node, count in counts.items() if count == min_count] + rprint( + f" [red]⚠ Issue:[/red] {overloaded[0]} has {max_count} shards while {underloaded[0]} has only {min_count} shards" + ) + + elif anomaly.anomaly_type == "Storage Imbalance": + if "storage_sizes" in anomaly.details: + sizes = anomaly.details["storage_sizes"] + min_size = min(sizes.values()) + max_size = max(sizes.values()) + overloaded = [node for node, size in sizes.items() if size == max_size][0] + underloaded = [node for node, size in sizes.items() if size == min_size][0] + rprint( + f" [red]⚠ Issue:[/red] Storage ranges from {format_storage_size(min_size)} ({underloaded}) to {format_storage_size(max_size)} ({overloaded}) - {max_size / min_size:.1f}x difference" + ) + + elif anomaly.anomaly_type == "Node Coverage Issue": + if "nodes_without_shards" in anomaly.details: + missing_nodes = anomaly.details["nodes_without_shards"] + coverage_ratio = anomaly.details["coverage_ratio"] + rprint( + f" [red]⚠ Issue:[/red] Table missing from {len(missing_nodes)} nodes ({coverage_ratio:.0%} cluster coverage)" + ) + rprint( + f" [dim] Missing from: {', '.join(missing_nodes[:3])}{'...' 
if len(missing_nodes) > 3 else ''}[/dim]" + ) + + elif anomaly.anomaly_type == "Document Imbalance": + if "document_counts" in anomaly.details: + doc_counts = anomaly.details["document_counts"] + min_docs = min(doc_counts.values()) + max_docs = max(doc_counts.values()) + ratio = max_docs / min_docs if min_docs > 0 else float("inf") + rprint( + f" [red]⚠ Issue:[/red] Document counts range from {min_docs:,} to {max_docs:,} ({ratio:.1f}x difference)" + ) + + # Show recommendations + rprint(" [green]πŸ’‘ Solutions:[/green]") + for rec in anomaly.recommendations: + rprint(f" β€’ {rec}") + + # Summary statistics + unique_tables = set(anomaly.table.full_table_name for anomaly in anomalies) + rprint("\n[dim]πŸ“Š Analysis Summary:[/dim]") + rprint(f"[dim]β€’ Tables analyzed: {tables_analyzed}[/dim]") + rprint(f"[dim]β€’ Tables with issues: {len(unique_tables)}[/dim]") + rprint(f"[dim]β€’ Total anomalies found: {len(anomalies)}[/dim]") + rprint(f"[dim]β€’ Critical issues (score >50): {len([a for a in anomalies if a.combined_score >= 50])}[/dim]") + rprint( + f"[dim]β€’ Warning issues (score 25-50): {len([a for a in anomalies if 25 <= a.combined_score < 50])}[/dim]" + ) diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index 339f9e7f..e5e6e834 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -11,6 +11,7 @@ from rich.console import Console from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer, ShardReporter +from cratedb_toolkit.admin.xmover.analysis.table import DistributionAnalyzer from cratedb_toolkit.admin.xmover.analysis.zone import ZoneReport from cratedb_toolkit.admin.xmover.model import ( ShardRelocationConstraints, @@ -185,6 +186,69 @@ def check_balance(ctx, table: Optional[str], tolerance: float): report.shard_balance(tolerance=tolerance, table=table) +@main.command() +@click.option("--top-tables", default=10, help="Number of largest tables to analyze (default: 10)") 
+@click.option("--table", help='Analyze specific table only (e.g., "my_table" or "schema.table")') +@click.pass_context +def shard_distribution(ctx, top_tables: int, table: Optional[str]): + """Analyze shard distribution anomalies across cluster nodes + + This command analyzes the largest tables in your cluster to detect: + β€’ Uneven shard count distribution between nodes + β€’ Storage imbalances across nodes + β€’ Missing node coverage for tables + β€’ Document count imbalances indicating data skew + + Results are ranked by impact and severity to help prioritize fixes. + + Examples: + xmover shard-distribution # Analyze top 10 tables + xmover shard-distribution --top-tables 20 # Analyze top 20 tables + xmover shard-distribution --table my_table # Detailed report for specific table + """ + try: + client = ctx.obj["client"] + analyzer = DistributionAnalyzer(client) + + if table: + # Focused table analysis mode + console.print(f"[blue]πŸ” Analyzing table: {table}...[/blue]") + + # Find table (handles schema auto-detection) + table_identifier = analyzer.find_table_by_name(table) + if not table_identifier: + console.print(f"[red]❌ Table '{table}' not found[/red]") + return + + # Get detailed distribution + table_dist = analyzer.get_table_distribution_detailed(table_identifier) + if not table_dist: + console.print(f"[red]❌ No shard data found for table '{table_identifier}'[/red]") + return + + # Display comprehensive health report + analyzer.format_table_health_report(table_dist) + + else: + # General anomaly detection mode + console.print(f"[blue]πŸ” Analyzing shard distribution for top {top_tables} tables...[/blue]") + console.print() + + # Perform analysis + anomalies, tables_analyzed = analyzer.analyze_distribution(top_tables) + + # Display results + analyzer.format_distribution_report(anomalies, tables_analyzed) + + except KeyboardInterrupt: + console.print("\n[yellow]Analysis interrupted by user[/yellow]") + except Exception as e: + console.print(f"[red]Error 
during distribution analysis: {e}[/red]") + import traceback + + console.print(f"[dim]{traceback.format_exc()}[/dim]") + + @main.command() @click.option("--table", "-t", help="Analyze zones for specific table only") @click.option("--show-shards/--no-show-shards", default=False, help="Show individual shard details (default: False)") diff --git a/doc/admin/xmover/handbook.md b/doc/admin/xmover/handbook.md index cf9b4abe..05a3c57a 100644 --- a/doc/admin/xmover/handbook.md +++ b/doc/admin/xmover/handbook.md @@ -56,6 +56,19 @@ xmover recommend --execute xmover recommend --prioritize-space ``` +### Shard Distribution Analysis +This view dedicates a specific focus to large tables. +```bash +# Analyze distribution anomalies for top 10 largest tables +xmover shard-distribution + +# Analyze more tables +xmover shard-distribution --top-tables 20 + +# Detailed health report for specific table +xmover shard-distribution --table my_table +``` + ### Zone Analysis ```bash # Check zone balance diff --git a/doc/admin/xmover/index.md b/doc/admin/xmover/index.md index affa4825..99fd4404 100644 --- a/doc/admin/xmover/index.md +++ b/doc/admin/xmover/index.md @@ -11,6 +11,7 @@ SQL commands for shard rebalancing and node decommissioning. 
## Features - **Cluster Analysis**: Complete overview of shard distribution across nodes and zones +- **Shard Distribution Analysis**: Detect and rank distribution anomalies across largest tables - **Shard Movement Recommendations**: Intelligent suggestions for rebalancing with safety validation - **Recovery Monitoring**: Track ongoing shard recovery operations with progress details - **Zone Conflict Detection**: Prevents moves that would violate CrateDB's zone awareness diff --git a/doc/admin/xmover/queries.md b/doc/admin/xmover/queries.md index 27bd89e6..9844d8f6 100644 --- a/doc/admin/xmover/queries.md +++ b/doc/admin/xmover/queries.md @@ -216,3 +216,34 @@ SELECT WHERE current_state != 'STARTED' and table_name = 'dispatchio' and shard_id = 19 ORDER BY current_state, table_name, shard_id; ``` + +## "BIGDUDES": focuses on your **biggest storage consumers** and shows how their shards are distributed across nodes. + +```sql +WITH largest_tables AS ( + SELECT + schema_name, + table_name, + SUM(CASE WHEN "primary" = true THEN size ELSE 0 END) as total_primary_size + FROM sys.shards + WHERE schema_name NOT IN ('sys', 'information_schema', 'pg_catalog') + GROUP BY schema_name, table_name + ORDER BY total_primary_size DESC + LIMIT 10 + ) + SELECT + s.schema_name, + s.table_name, + s.node['name'] as node_name, + COUNT(CASE WHEN s."primary" = true THEN 1 END) as primary_shards, + COUNT(CASE WHEN s."primary" = false THEN 1 END) as replica_shards, + COUNT(*) as total_shards, + ROUND(SUM(s.size) / 1024.0 / 1024.0 / 1024.0, 2) as total_size_gb, + ROUND(SUM(CASE WHEN s."primary" = true THEN s.size ELSE 0 END) / 1024.0 / 1024.0 / 1024.0, 2) as primary_size_gb, + ROUND(SUM(CASE WHEN s."primary" = false THEN s.size ELSE 0 END) / 1024.0 / 1024.0 / 1024.0, 2) as replica_size_gb, + SUM(s.num_docs) as total_documents + FROM sys.shards s + INNER JOIN largest_tables lt ON (s.schema_name = lt.schema_name AND s.table_name = lt.table_name) + GROUP BY s.schema_name, s.table_name, 
s.node['name'] + ORDER BY s.schema_name, s.table_name, s.node['name']; +``` diff --git a/tests/admin/test_cli.py b/tests/admin/test_cli.py index 60e8d810..de3e4624 100644 --- a/tests/admin/test_cli.py +++ b/tests/admin/test_cli.py @@ -15,6 +15,7 @@ "recommend", "test-connection", "zone-analysis", + "shard-distribution", ], ) def test_xmover_all(cratedb, subcommand): From 5068671cd16755c0e5bb87657959f3f5ff5b61ab Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 21 Aug 2025 14:13:11 +0200 Subject: [PATCH 17/18] Admin/XMover: Code formatting. Linting. Type checking. - More or less just line-length fixes. - Only a single type adjustment was needed on the return value of the `analyze_distribution` method. - Ruff recommended to use set comprehensions, so here we go. - At a single spot where an exception has been `pass`ed, we added error output. Is it bad? --- .../admin/xmover/analysis/table.py | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/cratedb_toolkit/admin/xmover/analysis/table.py b/cratedb_toolkit/admin/xmover/analysis/table.py index b8f1a7ce..ef6dbdf3 100644 --- a/cratedb_toolkit/admin/xmover/analysis/table.py +++ b/cratedb_toolkit/admin/xmover/analysis/table.py @@ -5,6 +5,7 @@ and provide recommendations for optimization. 
""" +import logging import statistics from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple @@ -15,6 +16,8 @@ from cratedb_toolkit.admin.xmover.util.database import CrateDBClient +logger = logging.getLogger(__name__) + def format_storage_size(size_gb: float) -> str: """Format storage size with appropriate units and spacing""" @@ -134,7 +137,7 @@ def get_table_distribution_detailed(self, table_identifier: str) -> Optional[Tab AND s.routing_state = 'STARTED' GROUP BY s.schema_name, s.table_name, s.node['name'] ORDER BY s.node['name'] \ - """ + """ # noqa: E501 result = self.client.execute_query(query, [schema_name, table_name]) rows = result.get("rows", []) @@ -190,7 +193,8 @@ def format_table_health_report(self, table_dist: TableDistribution) -> None: rprint(f"β€’ Total Shards: {total_shards} ({total_primary_shards} primary + {total_replica_shards} replica)") rprint(f"β€’ Total Documents: {total_documents:,}") rprint( - f"β€’ Node Coverage: {len(table_nodes)}/{len(cluster_nodes)} nodes ({len(table_nodes) / len(cluster_nodes) * 100:.0f}%)" + f"β€’ Node Coverage: {len(table_nodes)}/{len(cluster_nodes)} nodes " + f"({len(table_nodes) / len(cluster_nodes) * 100:.0f}%)" ) if missing_nodes: @@ -261,7 +265,8 @@ def format_table_health_report(self, table_dist: TableDistribution) -> None: # Storage distribution analysis if storage_cv > 0.4: rprint( - f"β€’ [red]⚠ Storage Imbalance:[/red] Range {format_storage_size(min_storage)}-{format_storage_size(max_storage)} per node (CV: {storage_cv:.2f})" + f"β€’ [red]⚠ Storage Imbalance:[/red] Range " + f"{format_storage_size(min_storage)}-{format_storage_size(max_storage)} per node (CV: {storage_cv:.2f})" ) else: rprint(f"β€’ [green]βœ“ Storage Balance:[/green] Well distributed (CV: {storage_cv:.2f})") @@ -306,11 +311,13 @@ def format_table_health_report(self, table_dist: TableDistribution) -> None: for zone in sorted(zone_distribution.keys()): zone_data = zone_distribution[zone] rprint( - f"β€’ 
{zone}: {zone_data['nodes']} nodes, {zone_data['shards']} shards, {format_storage_size(zone_data['size'])}" + f"β€’ {zone}: {zone_data['nodes']} nodes, " + f"{zone_data['shards']} shards, {format_storage_size(zone_data['size'])}" ) except Exception: - pass # Zone info not available + # Zone info not available + logger.exception("Zone info not available") # Health Summary rprint("\n[bold]πŸ’Š Health Summary[/bold]") @@ -375,7 +382,7 @@ def get_largest_tables_distribution(self, top_n: int = 10) -> List[TableDistribu WHERE s.routing_state = 'STARTED' GROUP BY s.schema_name, s.table_name, s.node['name'] ORDER BY s.schema_name, s.table_name, s.node['name'] \ - """ + """ # noqa: E501 result = self.client.execute_query(query, [top_n]) @@ -534,7 +541,8 @@ def detect_storage_imbalance(self, table: TableDistribution) -> Optional[Distrib if overloaded_node and underloaded_node: recommendations.append( - f"Rebalance storage from {overloaded_node} ({format_storage_size(max_size)}) to {underloaded_node} ({format_storage_size(min_size)})" + f"Rebalance storage from {overloaded_node} ({format_storage_size(max_size)}) " + f"to {underloaded_node} ({format_storage_size(min_size)})" ) return DistributionAnomaly( @@ -643,7 +651,7 @@ def detect_document_imbalance(self, table: TableDistribution) -> Optional[Distri recommendations=recommendations, ) - def analyze_distribution(self, top_tables: int = 10) -> List[DistributionAnomaly]: + def analyze_distribution(self, top_tables: int = 10) -> Tuple[List[DistributionAnomaly], int]: """Analyze shard distribution and return ranked anomalies""" # Get table distributions @@ -672,12 +680,13 @@ def format_distribution_report(self, anomalies: List[DistributionAnomaly], table if not anomalies: rprint( - f"[green]βœ“ No significant shard distribution anomalies detected in top {tables_analyzed} tables![/green]" + f"[green]βœ“ No significant shard distribution anomalies " + f"detected in top {tables_analyzed} tables![/green]" ) return # Show analysis 
scope - unique_tables = set(anomaly.table.full_table_name for anomaly in anomalies) + unique_tables = {anomaly.table.full_table_name for anomaly in anomalies} rprint( f"[blue]πŸ“‹ Analyzed {tables_analyzed} largest tables, found issues in {len(unique_tables)} tables[/blue]" ) @@ -731,7 +740,8 @@ def format_distribution_report(self, anomalies: List[DistributionAnomaly], table overloaded = [node for node, count in counts.items() if count == max_count] underloaded = [node for node, count in counts.items() if count == min_count] rprint( - f" [red]⚠ Issue:[/red] {overloaded[0]} has {max_count} shards while {underloaded[0]} has only {min_count} shards" + f" [red]⚠ Issue:[/red] {overloaded[0]} has {max_count} shards " + f"while {underloaded[0]} has only {min_count} shards" ) elif anomaly.anomaly_type == "Storage Imbalance": @@ -742,7 +752,8 @@ def format_distribution_report(self, anomalies: List[DistributionAnomaly], table overloaded = [node for node, size in sizes.items() if size == max_size][0] underloaded = [node for node, size in sizes.items() if size == min_size][0] rprint( - f" [red]⚠ Issue:[/red] Storage ranges from {format_storage_size(min_size)} ({underloaded}) to {format_storage_size(max_size)} ({overloaded}) - {max_size / min_size:.1f}x difference" + f" [red]⚠ Issue:[/red] Storage ranges from {format_storage_size(min_size)} ({underloaded}) " # noqa: E501 + f"to {format_storage_size(max_size)} ({overloaded}) - {max_size / min_size:.1f}x difference" ) elif anomaly.anomaly_type == "Node Coverage Issue": @@ -750,11 +761,11 @@ def format_distribution_report(self, anomalies: List[DistributionAnomaly], table missing_nodes = anomaly.details["nodes_without_shards"] coverage_ratio = anomaly.details["coverage_ratio"] rprint( - f" [red]⚠ Issue:[/red] Table missing from {len(missing_nodes)} nodes ({coverage_ratio:.0%} cluster coverage)" - ) - rprint( - f" [dim] Missing from: {', '.join(missing_nodes[:3])}{'...' 
if len(missing_nodes) > 3 else ''}[/dim]" + f" [red]⚠ Issue:[/red] Table missing from {len(missing_nodes)} nodes " + f"({coverage_ratio:.0%} cluster coverage)" ) + ellipsis = "..." if len(missing_nodes) > 3 else "" + rprint(f" [dim] Missing from: {', '.join(missing_nodes[:3])}{ellipsis}[/dim]") elif anomaly.anomaly_type == "Document Imbalance": if "document_counts" in anomaly.details: @@ -763,7 +774,8 @@ def format_distribution_report(self, anomalies: List[DistributionAnomaly], table max_docs = max(doc_counts.values()) ratio = max_docs / min_docs if min_docs > 0 else float("inf") rprint( - f" [red]⚠ Issue:[/red] Document counts range from {min_docs:,} to {max_docs:,} ({ratio:.1f}x difference)" + f" [red]⚠ Issue:[/red] Document counts range " + f"from {min_docs:,} to {max_docs:,} ({ratio:.1f}x difference)" ) # Show recommendations @@ -772,7 +784,7 @@ def format_distribution_report(self, anomalies: List[DistributionAnomaly], table rprint(f" β€’ {rec}") # Summary statistics - unique_tables = set(anomaly.table.full_table_name for anomaly in anomalies) + unique_tables = {anomaly.table.full_table_name for anomaly in anomalies} rprint("\n[dim]πŸ“Š Analysis Summary:[/dim]") rprint(f"[dim]β€’ Tables analyzed: {tables_analyzed}[/dim]") rprint(f"[dim]β€’ Tables with issues: {len(unique_tables)}[/dim]") From 256ff3d27d434ceeefb4647e4a68a995fdaf9c7c Mon Sep 17 00:00:00 2001 From: Walter Behmann Date: Fri, 5 Sep 2025 11:32:34 +0200 Subject: [PATCH 18/18] Admin/XMover: Add module for active shard monitoring --- .../admin/xmover/analysis/shard.py | 180 +++++++ cratedb_toolkit/admin/xmover/cli.py | 197 +++++++- cratedb_toolkit/admin/xmover/model.py | 64 +++ cratedb_toolkit/admin/xmover/util/database.py | 59 ++- doc/admin/xmover/handbook.md | 126 +++++ pyproject.toml | 1 + tests/admin/test_active_shard_monitor.py | 472 ++++++++++++++++++ tests/admin/test_distribution_analyzer.py | 294 +++++++++++ tests/admin/test_recovery_monitor.py | 296 +++++++++++ 9 files changed, 1687 
insertions(+), 2 deletions(-) create mode 100644 tests/admin/test_active_shard_monitor.py create mode 100644 tests/admin/test_distribution_analyzer.py create mode 100644 tests/admin/test_recovery_monitor.py diff --git a/cratedb_toolkit/admin/xmover/analysis/shard.py b/cratedb_toolkit/admin/xmover/analysis/shard.py index f6f24b6b..a1869019 100644 --- a/cratedb_toolkit/admin/xmover/analysis/shard.py +++ b/cratedb_toolkit/admin/xmover/analysis/shard.py @@ -13,6 +13,8 @@ from rich.table import Table from cratedb_toolkit.admin.xmover.model import ( + ActiveShardActivity, + ActiveShardSnapshot, DistributionStats, NodeInfo, ShardInfo, @@ -947,3 +949,181 @@ def distribution(self, table: str = None): table_summary.add_row("Node Balance Score", f"{stats.node_balance_score:.1f}/100") console.print(table_summary) + + +class ActiveShardMonitor: + """Monitor active shard checkpoint progression over time""" + + def __init__(self, client: CrateDBClient): + self.client = client + + def compare_snapshots( + self, + snapshot1: List[ActiveShardSnapshot], + snapshot2: List[ActiveShardSnapshot], + min_activity_threshold: int = 0, + ) -> List["ActiveShardActivity"]: + """Compare two snapshots and return activity data for shards present in both + + Args: + snapshot1: First snapshot (baseline) + snapshot2: Second snapshot (comparison) + min_activity_threshold: Minimum checkpoint delta to consider active (default: 0) + """ + + # Create lookup dict for snapshot1 + snapshot1_dict = {snap.shard_identifier: snap for snap in snapshot1} + + activities = [] + + for snap2 in snapshot2: + snap1 = snapshot1_dict.get(snap2.shard_identifier) + if snap1: + # Calculate local checkpoint delta + local_checkpoint_delta = snap2.local_checkpoint - snap1.local_checkpoint + time_diff = snap2.timestamp - snap1.timestamp + + # Filter based on actual activity between snapshots + if local_checkpoint_delta >= min_activity_threshold: + activity = ActiveShardActivity( + schema_name=snap2.schema_name, + 
table_name=snap2.table_name, + shard_id=snap2.shard_id, + node_name=snap2.node_name, + is_primary=snap2.is_primary, + partition_ident=snap2.partition_ident, + local_checkpoint_delta=local_checkpoint_delta, + snapshot1=snap1, + snapshot2=snap2, + time_diff_seconds=time_diff, + ) + activities.append(activity) + + # Sort by activity (highest checkpoint delta first) + activities.sort(key=lambda x: x.local_checkpoint_delta, reverse=True) + + return activities + + def format_activity_display( + self, activities: List["ActiveShardActivity"], show_count: int = 10, watch_mode: bool = False + ) -> str: + """Format activity data for console display""" + if not activities: + return "βœ… No active shards with significant checkpoint progression found" + + # Limit to requested count + activities = activities[:show_count] + + # Calculate observation period for context + if activities: + observation_period = activities[0].time_diff_seconds + output = [ + f"\nπŸ”₯ Most Active Shards ({len(activities)} shown, {observation_period:.0f}s observation period)" + ] + else: + output = [f"\nπŸ”₯ Most Active Shards ({len(activities)} shown, sorted by checkpoint activity)"] + + output.append("") + + # Add activity rate context + if activities: + total_activity = sum(a.local_checkpoint_delta for a in activities) + avg_rate = sum(a.activity_rate for a in activities) / len(activities) + output.append( + f"[dim]Total checkpoint activity: {total_activity:,} changes, Average rate: {avg_rate:.1f}/sec[/dim]" + ) + output.append("") + + # Create table headers + headers = ["Rank", "Schema.Table", "Shard", "Partition", "Node", "Type", "Checkpoint Ξ”", "Rate/sec", "Trend"] + + # Calculate column widths + col_widths = [len(h) for h in headers] + + # Prepare rows + rows = [] + for i, activity in enumerate(activities, 1): + # Format values + rank = str(i) + table_id = activity.table_identifier + shard_id = str(activity.shard_id) + partition = ( + activity.partition_ident[:14] + "..." 
+ if len(activity.partition_ident) > 14 + else activity.partition_ident or "-" + ) + node = activity.node_name + shard_type = "P" if activity.is_primary else "R" + checkpoint_delta = f"{activity.local_checkpoint_delta:,}" + rate = f"{activity.activity_rate:.1f}" if activity.activity_rate >= 0.1 else "<0.1" + + # Calculate activity trend indicator + if activity.activity_rate >= 100: + trend = "πŸ”₯ HOT" + elif activity.activity_rate >= 50: + trend = "πŸ“ˆ HIGH" + elif activity.activity_rate >= 10: + trend = "πŸ“Š MED" + else: + trend = "πŸ“‰ LOW" + + row = [rank, table_id, shard_id, partition, node, shard_type, checkpoint_delta, rate, trend] + rows.append(row) + + # Update column widths + for j, cell in enumerate(row): + col_widths[j] = max(col_widths[j], len(cell)) + + # Format table + header_row = " " + " | ".join(h.ljust(w) for h, w in zip(headers, col_widths)) + output.append(header_row) + output.append(" " + "-" * (len(header_row) - 3)) + + # Data rows + for row in rows: + data_row = " " + " | ".join(cell.ljust(w) for cell, w in zip(row, col_widths)) + output.append(data_row) + + # Only show legend and insights in non-watch mode + if not watch_mode: + output.append("") + output.append("Legend:") + output.append(" β€’ Checkpoint Ξ”: Write operations during observation period") + output.append(" β€’ Rate/sec: Checkpoint changes per second") + output.append(" β€’ Partition: partition_ident (truncated if >14 chars, '-' if none)") + output.append(" β€’ Type: P=Primary, R=Replica") + output.append(" β€’ Trend: πŸ”₯ HOT (β‰₯100/s), πŸ“ˆ HIGH (β‰₯50/s), πŸ“Š MED (β‰₯10/s), πŸ“‰ LOW (<10/s)") + + # Add insights about activity patterns + if activities: + output.append("") + output.append("Insights:") + + # Count by trend + hot_count = len([a for a in activities if a.activity_rate >= 100]) + high_count = len([a for a in activities if 50 <= a.activity_rate < 100]) + med_count = len([a for a in activities if 10 <= a.activity_rate < 50]) + low_count = len([a for a in 
activities if a.activity_rate < 10]) + + if hot_count > 0: + output.append(f" β€’ {hot_count} HOT shards (β‰₯100 changes/sec) - consider load balancing") + if high_count > 0: + output.append(f" β€’ {high_count} HIGH activity shards - monitor capacity") + if med_count > 0: + output.append(f" β€’ {med_count} MEDIUM activity shards - normal operation") + if low_count > 0: + output.append(f" β€’ {low_count} LOW activity shards - occasional writes") + + # Identify patterns + primary_activities = [a for a in activities if a.is_primary] + if len(primary_activities) == len(activities): + output.append(" β€’ All active shards are PRIMARY - normal write pattern") + elif len(primary_activities) < len(activities) * 0.5: + output.append(" β€’ Many REPLICA shards active - possible recovery/replication activity") + + # Node concentration + nodes = {a.node_name for a in activities} + if len(nodes) <= 2: + output.append(f" β€’ Activity concentrated on {len(nodes)} node(s) - consider redistribution") + + return "\n".join(output) diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py index e5e6e834..010f9aeb 100644 --- a/cratedb_toolkit/admin/xmover/cli.py +++ b/cratedb_toolkit/admin/xmover/cli.py @@ -5,12 +5,14 @@ """ import sys +import time from typing import Optional import click from rich.console import Console +from rich.panel import Panel -from cratedb_toolkit.admin.xmover.analysis.shard import ShardAnalyzer, ShardReporter +from cratedb_toolkit.admin.xmover.analysis.shard import ActiveShardMonitor, ShardAnalyzer, ShardReporter from cratedb_toolkit.admin.xmover.analysis.table import DistributionAnalyzer from cratedb_toolkit.admin.xmover.analysis.zone import ZoneReport from cratedb_toolkit.admin.xmover.model import ( @@ -249,6 +251,199 @@ def shard_distribution(ctx, top_tables: int, table: Optional[str]): console.print(f"[dim]{traceback.format_exc()}[/dim]") +@main.command() +@click.option("--count", default=10, help="Number of most active shards 
to show (default: 10)") +@click.option("--interval", default=30, help="Observation interval in seconds (default: 30)") +@click.option( + "--min-checkpoint-delta", + default=1000, + help="Minimum checkpoint progression between snapshots to show shard (default: 1000)", +) +@click.option("--table", "-t", help="Monitor specific table only") +@click.option("--node", "-n", help="Monitor specific node only") +@click.option("--watch", "-w", is_flag=True, help="Continuously monitor (refresh every interval)") +@click.option("--exclude-system", is_flag=True, help="Exclude system tables (gc.*, information_schema.*)") +@click.option("--min-rate", type=float, help="Minimum activity rate (changes/sec) to show") +@click.option("--show-replicas/--hide-replicas", default=True, help="Show replica shards (default: True)") +@click.pass_context +def active_shards( + ctx, + count: int, + interval: int, + min_checkpoint_delta: int, + table: Optional[str], + node: Optional[str], + watch: bool, + exclude_system: bool, + min_rate: Optional[float], + show_replicas: bool, +): + """Monitor most active shards by checkpoint progression + + This command takes two snapshots of ALL started shards separated by the + observation interval, then shows the shards with the highest checkpoint + progression (activity) between the snapshots. + + Unlike other commands, this tracks ALL shards and filters based on actual + activity between snapshots, not current state. This captures shards that + become active during the observation period. + + Useful for identifying which shards are receiving the most write activity + in your cluster and understanding write patterns. 
+ + Examples: + xmover active-shards --count 20 --interval 60 # Top 20 over 60 seconds + xmover active-shards --watch --interval 30 # Continuous monitoring + xmover active-shards --table my_table --watch # Monitor specific table + xmover active-shards --node data-hot-1 --count 5 # Top 5 on specific node + xmover active-shards --min-checkpoint-delta 500 # Lower activity threshold + xmover active-shards --exclude-system --min-rate 50 # Skip system tables, min 50/sec + xmover active-shards --hide-replicas --count 20 # Only primary shards + """ + client = ctx.obj["client"] + monitor = ActiveShardMonitor(client) + + def get_filtered_snapshot(): + """Get snapshot with optional filtering""" + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=min_checkpoint_delta) + + # Apply table filter if specified + if table: + snapshots = [s for s in snapshots if s.table_name == table or f"{s.schema_name}.{s.table_name}" == table] + + # Apply node filter if specified + if node: + snapshots = [s for s in snapshots if s.node_name == node] + + # Exclude system tables if requested + if exclude_system: + snapshots = [ + s + for s in snapshots + if not ( + s.schema_name.startswith("gc.") + or s.schema_name == "information_schema" + or s.schema_name == "sys" + or s.table_name.endswith("_events") + or s.table_name.endswith("_log") + ) + ] + + return snapshots + + def run_single_analysis(): + """Run a single analysis cycle""" + if not watch: + console.print(Panel.fit("[bold blue]Active Shards Monitor[/bold blue]")) + + # Show configuration - simplified for watch mode + if watch: + config_parts = [f"{interval}s interval", f"threshold: {min_checkpoint_delta:,}", f"top {count}"] + if table: + config_parts.append(f"table: {table}") + if node: + config_parts.append(f"node: {node}") + console.print(f"[dim]{' | '.join(config_parts)}[/dim]") + else: + config_info = [ + f"Observation interval: {interval}s", + f"Min checkpoint delta: {min_checkpoint_delta:,}", + f"Show count: {count}", 
+ ] + if table: + config_info.append(f"Table filter: {table}") + if node: + config_info.append(f"Node filter: {node}") + if exclude_system: + config_info.append("Excluding system tables") + if min_rate: + config_info.append(f"Min rate: {min_rate}/sec") + if not show_replicas: + config_info.append("Primary shards only") + + console.print("[dim]" + " | ".join(config_info) + "[/dim]") + console.print() + + # Take first snapshot + if not watch: + console.print("πŸ“· Taking first snapshot...") + snapshot1 = get_filtered_snapshot() + + if not snapshot1: + console.print("[yellow]No started shards found matching criteria[/yellow]") + return + + if not watch: + console.print(f" Tracking {len(snapshot1)} started shards for activity") + console.print(f"⏱️ Waiting {interval} seconds for activity...") + + # Wait for observation interval + if watch: + # Simplified countdown for watch mode + for remaining in range(interval, 0, -1): + if remaining % 5 == 0 or remaining <= 3: # Show fewer updates + console.print(f"[dim]⏱️ {remaining}s...[/dim]", end="\r") + time.sleep(1) + console.print(" " * 15, end="\r") # Clear countdown + else: + time.sleep(interval) + + # Take second snapshot + if not watch: + console.print("πŸ“· Taking second snapshot...") + snapshot2 = get_filtered_snapshot() + + if not snapshot2: + console.print("[yellow]No started shards found in second snapshot[/yellow]") + return + + if not watch: + console.print(f" Tracking {len(snapshot2)} started shards for activity") + + # Compare snapshots and show results + activities = monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=min_checkpoint_delta) + + # Apply additional filters + if not show_replicas: + activities = [a for a in activities if a.is_primary] + + if min_rate: + activities = [a for a in activities if a.activity_rate >= min_rate] + + if not activities: + console.print( + f"[green]βœ… No shards exceeded activity threshold ({min_checkpoint_delta:,} checkpoint changes)[/green]" + ) + if 
min_rate: + console.print(f"[dim]Also filtered by minimum rate: {min_rate}/sec[/dim]") + else: + if not watch: + overlap_count = len({s.shard_identifier for s in snapshot1} & {s.shard_identifier for s in snapshot2}) + console.print(f"[dim]Analyzed {overlap_count} shards present in both snapshots[/dim]") + console.print(monitor.format_activity_display(activities, show_count=count, watch_mode=watch)) + + try: + if watch: + console.print("[dim]Press Ctrl+C to stop monitoring[/dim]") + console.print() + + while True: + run_single_analysis() + if watch: + console.print(f"\n[dim]━━━ Next update in {interval}s ━━━[/dim]\n") + time.sleep(interval) + else: + run_single_analysis() + + except KeyboardInterrupt: + console.print("\n[yellow]Monitoring stopped by user[/yellow]") + except Exception as e: + console.print(f"[red]Error during active shards monitoring: {e}[/red]") + import traceback + + console.print(f"[dim]{traceback.format_exc()}[/dim]") + + @main.command() @click.option("--table", "-t", help="Analyze zones for specific table only") @click.option("--show-shards/--no-show-shards", default=False, help="Show individual shard details (default: False)") diff --git a/cratedb_toolkit/admin/xmover/model.py b/cratedb_toolkit/admin/xmover/model.py index 34e43f77..d8511b31 100644 --- a/cratedb_toolkit/admin/xmover/model.py +++ b/cratedb_toolkit/admin/xmover/model.py @@ -184,3 +184,67 @@ class ShardRelocationConstraints: max_recommendations: int = 10 max_disk_usage: float = 90.0 prioritize_space: bool = False + + +@dataclass +class ActiveShardSnapshot: + """Snapshot of active shard checkpoint data for tracking activity""" + + schema_name: str + table_name: str + shard_id: int + node_name: str + is_primary: bool + partition_ident: str + local_checkpoint: int + global_checkpoint: int + translog_uncommitted_bytes: int + timestamp: float # Unix timestamp when snapshot was taken + + @property + def checkpoint_delta(self) -> int: + """Current checkpoint delta (local - global)""" + 
return self.local_checkpoint - self.global_checkpoint + + @property + def translog_uncommitted_mb(self) -> float: + """Translog uncommitted size in MB""" + return self.translog_uncommitted_bytes / (1024 * 1024) + + @property + def shard_identifier(self) -> str: + """Unique identifier for this shard including partition""" + shard_type = "P" if self.is_primary else "R" + partition = f":{self.partition_ident}" if self.partition_ident else "" + return f"{self.schema_name}.{self.table_name}:{self.shard_id}:{self.node_name}:{shard_type}{partition}" + + +@dataclass +class ActiveShardActivity: + """Activity comparison between two snapshots of the same shard""" + + schema_name: str + table_name: str + shard_id: int + node_name: str + is_primary: bool + partition_ident: str + local_checkpoint_delta: int # Change in local checkpoint between snapshots + snapshot1: ActiveShardSnapshot + snapshot2: ActiveShardSnapshot + time_diff_seconds: float + + @property + def activity_rate(self) -> float: + """Activity rate as checkpoint changes per second""" + if self.time_diff_seconds > 0: + return self.local_checkpoint_delta / self.time_diff_seconds + return 0.0 + + @property + def shard_type(self) -> str: + return "PRIMARY" if self.is_primary else "REPLICA" + + @property + def table_identifier(self) -> str: + return f"{self.schema_name}.{self.table_name}" diff --git a/cratedb_toolkit/admin/xmover/util/database.py b/cratedb_toolkit/admin/xmover/util/database.py index 21950ab0..5c9011bd 100644 --- a/cratedb_toolkit/admin/xmover/util/database.py +++ b/cratedb_toolkit/admin/xmover/util/database.py @@ -10,7 +10,7 @@ import urllib3 from dotenv import load_dotenv -from cratedb_toolkit.admin.xmover.model import NodeInfo, RecoveryInfo, ShardInfo +from cratedb_toolkit.admin.xmover.model import ActiveShardSnapshot, NodeInfo, RecoveryInfo, ShardInfo logger = logging.getLogger(__name__) @@ -496,3 +496,60 @@ def _is_recovery_completed(self, recovery_info: RecoveryInfo) -> bool: and 
recovery_info.files_percent >= 100.0 and recovery_info.bytes_percent >= 100.0 ) + + def get_active_shards_snapshot(self, min_checkpoint_delta: int = 1000) -> List[ActiveShardSnapshot]: + """Get a snapshot of all started shards for activity monitoring + + Note: This captures ALL started shards regardless of current activity level. + The min_checkpoint_delta parameter is kept for backwards compatibility but + filtering is now done during snapshot comparison to catch shards that + become active between observations. + + Args: + min_checkpoint_delta: Kept for compatibility - filtering now done in comparison + + Returns: + List of ActiveShardSnapshot objects for all started shards + """ + import time + + query = """ + SELECT sh.schema_name, \ + sh.table_name, \ + sh.id AS shard_id, \ + sh."primary", \ + node['name'] as node_name, \ + sh.partition_ident, \ + sh.translog_stats['uncommitted_size'] AS translog_uncommitted_bytes, \ + sh.seq_no_stats['local_checkpoint'] AS local_checkpoint, \ + sh.seq_no_stats['global_checkpoint'] AS global_checkpoint + FROM sys.shards AS sh + WHERE sh.state = 'STARTED' + ORDER BY sh.schema_name, sh.table_name, sh.id, sh.node['name'] \ + """ + + try: + result = self.execute_query(query) + snapshots = [] + current_time = time.time() + + for row in result.get("rows", []): + snapshot = ActiveShardSnapshot( + schema_name=row[0], + table_name=row[1], + shard_id=row[2], + is_primary=row[3], + node_name=row[4], + partition_ident=row[5] or "", + translog_uncommitted_bytes=row[6] or 0, + local_checkpoint=row[7] or 0, + global_checkpoint=row[8] or 0, + timestamp=current_time, + ) + snapshots.append(snapshot) + + return snapshots + + except Exception as e: + logger.error(f"Error getting active shards snapshot: {e}") + return [] diff --git a/doc/admin/xmover/handbook.md b/doc/admin/xmover/handbook.md index 05a3c57a..f9aee2e0 100644 --- a/doc/admin/xmover/handbook.md +++ b/doc/admin/xmover/handbook.md @@ -244,6 +244,132 @@ xmover monitor-recovery --watch 
--include-transitioning - **PEER**: Copying shard data from another node (replication/relocation) - **DISK**: Rebuilding shard from local data (after restart/disk issues) + +### `active-shards` +Monitor the most active shards by tracking checkpoint progression over time. +This command helps identify which shards are receiving the most write activity +by measuring local checkpoint progression between two snapshots. + +**Options:** +- `--count`: Number of most active shards to show (default: 10) +- `--interval`: Observation interval in seconds (default: 30) +- `--min-checkpoint-delta`: Minimum checkpoint progression between snapshots to show shard (default: 1000) +- `--table, -t`: Monitor specific table only +- `--node, -n`: Monitor specific node only +- `--watch, -w`: Continuously monitor (refresh every interval) +- `--exclude-system`: Exclude system tables (gc.*, information_schema.*, *_events, *_log) +- `--min-rate`: Minimum activity rate (changes/sec) to show +- `--show-replicas/--hide-replicas`: Show replica shards (default: True) + +**How it works:** +1. **Takes snapshot of ALL started shards** (not just currently active ones) +2. **Waits for observation interval** (configurable, default: 30 seconds) +3. **Takes second snapshot** of all started shards +4. **Compares snapshots** to find shards with checkpoint progression β‰₯ threshold +5. 
**Shows ranked results** with activity trends and insights
+
+**Enhanced output features:**
+- **Checkpoint visibility**: Shows actual `local_checkpoint` values (CP Start → CP End → Delta)
+- **Partition awareness**: Separate tracking for partitioned tables (different partition_ident values)
+- **Activity trends**: 🔥 HOT (≥100/s), 📈 HIGH (≥50/s), 📊 MED (≥10/s), 📉 LOW (<10/s)
+- **Smart insights**: Identifies concentration patterns and load distribution (non-watch mode)
+- **Flexible filtering**: Exclude system tables, set minimum rates, hide replicas
+- **Context information**: Total activity, average rates, observation period
+- **Clean watch mode**: Streamlined output without legend/insights for continuous monitoring
+
+This approach captures shards that become active during the observation period, providing a complete view of cluster write patterns and identifying hot spots. The enhanced filtering helps focus on business-critical activity patterns.
+
+**Sample output (single run):**
+```
+🔥 Most Active Shards (3 shown, 30s observation period)
+Total checkpoint activity: 190,314 changes, Average rate: 2,109.0/sec
+ Rank | Schema.Table | Shard | Partition | Node | Type | Checkpoint Δ | Rate/sec | Trend
+ -----------------------------------------------------------------------------------------------------------
+ 1 | gc.scheduled_jobs_log | 0 | - | data-hot-8 | P | 113,744 | 3,791.5 | 🔥 HOT
+ 2 | TURVO.events | 0 | 04732dpl6osj8d | data-hot-0 | P | 45,837 | 1,527.9 | 🔥 HOT
+ 3 | doc.user_actions | 1 | 04732dpk70rj6d | data-hot-2 | P | 30,733 | 1,024.4 | 🔥 HOT
+Legend:
+ • Checkpoint Δ: Write operations during observation period
+ • Partition: partition_ident (truncated if >14 chars, '-' if none)
+Insights:
+ • 3 HOT shards (≥100 changes/sec) - consider load balancing
+ • All active shards are PRIMARY - normal write pattern
+```
+
+**Sample output (watch mode - cleaner):**
+```
+30s interval | threshold: 1,000 | top 5
+🔥 Most Active Shards (3 shown, 30s observation period)
+Total checkpoint activity: 190,314 changes, Average rate: 2,109.0/sec
+ Rank | Schema.Table | Shard | Partition | Node | Type | Checkpoint Δ | Rate/sec | Trend
+ -----------------------------------------------------------------------------------------------------------
+ 1 | gc.scheduled_jobs_log | 0 | - | data-hot-8 | P | 113,744 | 3,791.5 | 🔥 HOT
+ 2 | TURVO.events | 0 | 04732dpl6osj8d | data-hot-0 | P | 45,837 | 1,527.9 | 🔥 HOT
+ 3 | doc.user_actions | 1 | 04732dpk70rj6d | data-hot-2 | P | 30,733 | 1,024.4 | 🔥 HOT
+━━━ Next update in 30s ━━━
+```
+
+#### Examples
+```bash
+# Show top 10 most active shards over 30 seconds
+xmover active-shards
+
+# Top 20 shards with 60-second observation period
+xmover active-shards --count 20 --interval 60
+
+# Continuous monitoring with 30-second intervals
+xmover active-shards --watch --interval 30
+
+# Monitor specific table activity
+xmover active-shards --table my_table --watch
+
+# Monitor specific node with custom threshold
+xmover active-shards --node data-hot-1 --min-checkpoint-delta 500
+
+# Exclude system tables and event logs for business data focus
+xmover active-shards --exclude-system --count 20
+
+# Only show high-activity shards (≥50 changes/sec)
+xmover active-shards --min-rate 50 --count 15
+
+# Focus on primary shards only
+xmover active-shards --hide-replicas --count 20
+```
+
+#### Monitoring Active Shards and Write Patterns
+
+Identify which shards are receiving the most write activity:
+
+1. Quick snapshot of most active shards:
+```bash
+# Show top 10 most active shards over 30 seconds
+xmover active-shards
+
+# Longer observation period for more accurate results
+xmover active-shards --count 15 --interval 60
+```
+
+2. 
Continuous monitoring for real-time insights: +```bash +# Continuous monitoring with 30-second intervals +xmover active-shards --watch --interval 30 + +# Monitor specific table for focused analysis +xmover active-shards --table critical_table --watch +``` + +3. Integration with rebalancing workflow: +```bash +# Identify hot shards first +xmover active-shards --count 20 --interval 60 + +# Move hot shards away from overloaded nodes +xmover recommend --table hot_table --prioritize-space --execute + +# Monitor the impact +xmover active-shards --table hot_table --watch +``` + ### `test-connection` Tests the connection to CrateDB and displays basic cluster information. diff --git a/pyproject.toml b/pyproject.toml index fa3309d5..762d4fdf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -325,6 +325,7 @@ lint.extend-ignore = [ ] lint.per-file-ignores."cratedb_toolkit/admin/xmover/analysis/shard.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."tests/admin/*" = [ "T201" ] # Allow use of `print`. 
lint.per-file-ignores."cratedb_toolkit/retention/cli.py" = [ "T201" ] # Allow `print` lint.per-file-ignores."cratedb_toolkit/sqlalchemy/__init__.py" = [ "F401" ] # Allow `moduleΒ΄ imported but unused lint.per-file-ignores."doc/conf.py" = [ "A001", "ERA001" ] diff --git a/tests/admin/test_active_shard_monitor.py b/tests/admin/test_active_shard_monitor.py new file mode 100644 index 00000000..55268b15 --- /dev/null +++ b/tests/admin/test_active_shard_monitor.py @@ -0,0 +1,472 @@ +""" +Tests for ActiveShardMonitor functionality +""" + +import time +from unittest.mock import Mock, patch + +from cratedb_toolkit.admin.xmover.analysis.shard import ActiveShardMonitor +from cratedb_toolkit.admin.xmover.model import ActiveShardActivity, ActiveShardSnapshot +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient + + +class TestActiveShardSnapshot: + """Test ActiveShardSnapshot dataclass""" + + def test_checkpoint_delta(self): + """Test checkpoint delta calculation""" + snapshot = ActiveShardSnapshot( + schema_name="test_schema", + table_name="test_table", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint=1500, + global_checkpoint=500, + translog_uncommitted_bytes=10485760, # 10MB + timestamp=time.time(), + ) + + assert snapshot.checkpoint_delta == 1000 + assert snapshot.translog_uncommitted_mb == 10.0 + assert snapshot.shard_identifier == "test_schema.test_table:1:node1:P" + + +class TestActiveShardActivity: + """Test ActiveShardActivity dataclass""" + + def test_activity_calculations(self): + """Test activity rate and property calculations""" + snapshot1 = ActiveShardSnapshot( + schema_name="test_schema", + table_name="test_table", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint=1000, + global_checkpoint=500, + translog_uncommitted_bytes=5242880, # 5MB + timestamp=100.0, + ) + + snapshot2 = ActiveShardSnapshot( + schema_name="test_schema", + table_name="test_table", + 
shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint=1500, + global_checkpoint=500, + translog_uncommitted_bytes=10485760, # 10MB + timestamp=130.0, # 30 seconds later + ) + + activity = ActiveShardActivity( + schema_name="test_schema", + table_name="test_table", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=500, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0, + ) + + assert activity.activity_rate == 500 / 30.0 # ~16.67 changes/sec + assert activity.shard_type == "PRIMARY" + assert activity.table_identifier == "test_schema.test_table" + + +class TestCrateDBClientActiveShards: + """Test CrateDB client active shards functionality""" + + @patch.object(CrateDBClient, "execute_query") + def test_get_active_shards_snapshot_success(self, mock_execute): + """Test successful snapshot retrieval""" + mock_execute.return_value = { + "rows": [ + ["schema1", "table1", 1, True, "node1", "", 10485760, 1500, 500], + ["schema1", "table2", 2, False, "node2", "part1", 20971520, 2000, 800], + ] + } + + client = CrateDBClient("http://test") + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=1000) + + assert len(snapshots) == 2 + + # Check first snapshot + snap1 = snapshots[0] + assert snap1.schema_name == "schema1" + assert snap1.table_name == "table1" + assert snap1.shard_id == 1 + assert snap1.is_primary is True + assert snap1.node_name == "node1" + assert snap1.local_checkpoint == 1500 + assert snap1.global_checkpoint == 500 + assert snap1.checkpoint_delta == 1000 + assert snap1.translog_uncommitted_mb == 10.0 + + # Check second snapshot + snap2 = snapshots[1] + assert snap2.schema_name == "schema1" + assert snap2.table_name == "table2" + assert snap2.shard_id == 2 + assert snap2.is_primary is False + assert snap2.node_name == "node2" + assert snap2.partition_ident == "part1" + assert snap2.checkpoint_delta == 1200 + assert 
snap2.translog_uncommitted_mb == 20.0 + + # Verify query was called without checkpoint delta filter (new behavior) + mock_execute.assert_called_once() + args = mock_execute.call_args[0] + # No longer passes min_checkpoint_delta parameter + assert len(args) == 1 # Only the query, no parameters + + @patch.object(CrateDBClient, "execute_query") + def test_get_active_shards_snapshot_empty(self, mock_execute): + """Test snapshot retrieval with no results""" + mock_execute.return_value = {"rows": []} + + client = CrateDBClient("http://test") + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=1000) + + assert snapshots == [] + + @patch.object(CrateDBClient, "execute_query") + def test_get_active_shards_snapshot_error(self, mock_execute): + """Test snapshot retrieval with database error""" + mock_execute.side_effect = Exception("Database connection failed") + + client = CrateDBClient("http://test") + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=1000) + + assert snapshots == [] + + +class TestActiveShardMonitor: + """Test ActiveShardMonitor class""" + + def setup_method(self): + """Set up test fixtures""" + self.mock_client = Mock(spec=CrateDBClient) + self.monitor = ActiveShardMonitor(self.mock_client) + + def create_test_snapshot( + self, + schema: str, + table: str, + shard_id: int, + node: str, + is_primary: bool, + local_checkpoint: int, + timestamp: float, + ): + """Helper to create test snapshots""" + return ActiveShardSnapshot( + schema_name=schema, + table_name=table, + shard_id=shard_id, + node_name=node, + is_primary=is_primary, + partition_ident="", + local_checkpoint=local_checkpoint, + global_checkpoint=500, # Fixed for simplicity + translog_uncommitted_bytes=10485760, # 10MB + timestamp=timestamp, + ) + + def test_compare_snapshots_with_activity(self): + """Test comparing snapshots with active shards""" + # Create first snapshot + snapshot1 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 
100.0), + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2000, 100.0), + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3000, 100.0), + ] + + # Create second snapshot (30 seconds later with activity) + snapshot2 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1500, 130.0), # +500 + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2200, 130.0), # +200 + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3000, 130.0), # No change + self.create_test_snapshot("schema1", "table4", 1, "node3", True, 1000, 130.0), # New shard + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + # Should have 2 activities (table3 had no change, table4 is new) + assert len(activities) == 2 + + # Check activities are sorted by checkpoint delta (highest first) + assert activities[0].local_checkpoint_delta == 500 # table1 + assert activities[0].schema_name == "schema1" + assert activities[0].table_name == "table1" + + assert activities[1].local_checkpoint_delta == 200 # table2 + assert activities[1].schema_name == "schema1" + assert activities[1].table_name == "table2" + + # Check activity rate calculation + assert activities[0].activity_rate == 500 / 30.0 # ~16.67/sec + assert activities[1].activity_rate == 200 / 30.0 # ~6.67/sec + + def test_compare_snapshots_no_activity(self): + """Test comparing snapshots with no activity""" + # Create identical snapshots + snapshot1 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0), + ] + + snapshot2 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 130.0), # No change + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + assert activities == [] + + def test_compare_snapshots_no_overlap(self): + """Test comparing snapshots with no overlapping shards""" + snapshot1 = [ + self.create_test_snapshot("schema1", 
"table1", 1, "node1", True, 1000, 100.0), + ] + + snapshot2 = [ + self.create_test_snapshot("schema1", "table2", 1, "node2", True, 1500, 130.0), # Different shard + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + assert activities == [] + + def test_format_activity_display_with_activities(self): + """Test formatting activity display with data""" + # Create test activities + snapshot1 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0) + snapshot2 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1500, 130.0) + + activity = ActiveShardActivity( + schema_name="schema1", + table_name="table1", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=500, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0, + ) + + display = self.monitor.format_activity_display([activity], show_count=10, watch_mode=False) + + # Check that output contains expected elements + assert "Most Active Shards" in display + assert "schema1.table1" in display + assert "500" in display # checkpoint delta + assert "16.7" in display # activity rate + assert "P" in display # primary indicator + assert "Legend:" in display + assert "Trend:" in display # new trend column explanation + assert "Partition:" in display # new partition column explanation + + def test_format_activity_display_empty(self): + """Test formatting activity display with no data""" + display = self.monitor.format_activity_display([], show_count=10, watch_mode=False) + + assert "No active shards with significant checkpoint progression found" in display + + def test_format_activity_display_count_limit(self): + """Test that display respects show_count limit""" + # Create multiple activities + activities = [] + for i in range(15): + snapshot1 = self.create_test_snapshot("schema1", f"table{i}", 1, "node1", True, 1000, 100.0) + snapshot2 = self.create_test_snapshot("schema1", 
f"table{i}", 1, "node1", True, 1000 + (i + 1) * 100, 130.0) + + activity = ActiveShardActivity( + schema_name="schema1", + table_name=f"table{i}", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=(i + 1) * 100, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0, + ) + activities.append(activity) + + # Sort activities by checkpoint delta (highest first) - same as compare_snapshots does + activities.sort(key=lambda x: x.local_checkpoint_delta, reverse=True) + + # Should only show top 5 + display = self.monitor.format_activity_display(activities, show_count=5, watch_mode=False) + + # Count number of table entries in display + table_count = display.count("schema1.table") + assert table_count == 5 # Should only show 5 entries + + # Should show highest activity first (table14 has highest checkpoint delta) + assert "schema1.table14" in display + + def test_compare_snapshots_with_activity_threshold(self): + """Test filtering activities by minimum threshold""" + # Create snapshots with various activity levels + snapshot1 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0), # Will have +2000 delta + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2000, 100.0), # Will have +500 delta + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3000, 100.0), # Will have +100 delta + ] + + snapshot2 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 3000, 130.0), # +2000 delta + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2500, 130.0), # +500 delta + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3100, 130.0), # +100 delta + ] + + # Test with threshold of 1000 - should only show table1 (2000 delta) + activities_high_threshold = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1000) + assert len(activities_high_threshold) == 1 + assert 
activities_high_threshold[0].table_name == "table1" + assert activities_high_threshold[0].local_checkpoint_delta == 2000 + + # Test with threshold of 200 - should show table1 and table2 + activities_medium_threshold = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=200) + assert len(activities_medium_threshold) == 2 + assert activities_medium_threshold[0].local_checkpoint_delta == 2000 # table1 first (highest) + assert activities_medium_threshold[1].local_checkpoint_delta == 500 # table2 second + + # Test with threshold of 0 - should show all three + activities_low_threshold = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=0) + assert len(activities_low_threshold) == 3 + assert activities_low_threshold[0].local_checkpoint_delta == 2000 # Sorted by activity + assert activities_low_threshold[1].local_checkpoint_delta == 500 + assert activities_low_threshold[2].local_checkpoint_delta == 100 + + def test_primary_replica_separation(self): + """Test that primary and replica shards are tracked separately""" + # Create snapshots with same table/shard but different primary/replica + snapshot1 = [ + # Primary shard + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", True, 15876, 100.0), + # Replica shard (same table/shard/node but different type) + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", False, 129434, 100.0), + ] + + snapshot2 = [ + # Primary shard progresses normally + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", True, 16000, 130.0), # +124 delta + # Replica shard progresses normally + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", False, 129500, 130.0), # +66 delta + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + # Should have 2 separate activities (primary and replica tracked separately) + assert len(activities) == 2 + + # Find primary and replica activities + 
primary_activity = next(a for a in activities if a.is_primary) + replica_activity = next(a for a in activities if not a.is_primary) + + # Verify deltas are calculated correctly for each type + assert primary_activity.local_checkpoint_delta == 124 # 16000 - 15876 + assert replica_activity.local_checkpoint_delta == 66 # 129500 - 129434 + + # Verify they have different shard identifiers + assert primary_activity.snapshot1.shard_identifier != replica_activity.snapshot1.shard_identifier + assert "data-hot-8:P" in primary_activity.snapshot1.shard_identifier + assert "data-hot-8:R" in replica_activity.snapshot1.shard_identifier + + # This test prevents the bug where we mixed primary CP End with replica CP Start + # which created fake deltas like 129434 - 15876 = 113558 + + def test_partition_separation(self): + """Test that partitions within the same table/shard are tracked separately""" + # Create snapshots with same table/shard but different partitions + snapshot1 = [ + # Partition 1 + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", True, 32684, 100.0), + # Partition 2 (same table/shard/node/type but different partition) + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", True, 54289, 100.0), + ] + + # Modify partition_ident for the snapshots to simulate different partitions + snapshot1[0].partition_ident = "04732dpl6osj8d1g60o30c1g" + snapshot1[1].partition_ident = "04732dpl6os3adpm60o30c1g" + + snapshot2 = [ + # Partition 1 progresses + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", True, 32800, 130.0), + # +116 delta + # Partition 2 progresses + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", True, 54400, 130.0), + # +111 delta + ] + + # Set partition_ident for second snapshot + snapshot2[0].partition_ident = "04732dpl6osj8d1g60o30c1g" + snapshot2[1].partition_ident = "04732dpl6os3adpm60o30c1g" + + activities = 
self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + # Should have 2 separate activities (partitions tracked separately) + assert len(activities) == 2 + + # Verify deltas are calculated correctly for each partition + partition1_activity = next(a for a in activities if "04732dpl6osj8d1g60o30c1g" in a.snapshot1.shard_identifier) + partition2_activity = next(a for a in activities if "04732dpl6os3adpm60o30c1g" in a.snapshot1.shard_identifier) + + assert partition1_activity.local_checkpoint_delta == 116 # 32800 - 32684 + assert partition2_activity.local_checkpoint_delta == 111 # 54400 - 54289 + + # Verify they have different shard identifiers due to partition + assert partition1_activity.snapshot1.shard_identifier != partition2_activity.snapshot1.shard_identifier + assert ":04732dpl6osj8d1g60o30c1g" in partition1_activity.snapshot1.shard_identifier + assert ":04732dpl6os3adpm60o30c1g" in partition2_activity.snapshot1.shard_identifier + + # This test prevents mixing partitions which would create fake activity measurements + + def test_format_activity_display_watch_mode(self): + """Test that watch mode excludes legend and insights""" + snapshot1 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0) + snapshot2 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1500, 130.0) + + activity = ActiveShardActivity( + schema_name="schema1", + table_name="table1", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=500, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0, + ) + + # Test non-watch mode (should include legend and insights) + normal_display = self.monitor.format_activity_display([activity], show_count=10, watch_mode=False) + assert "Legend:" in normal_display + assert "Insights:" in normal_display + assert "Checkpoint Ξ”:" in normal_display + + # Test watch mode (should exclude legend and insights) + watch_display = 
self.monitor.format_activity_display([activity], show_count=10, watch_mode=True) + assert "Legend:" not in watch_display + assert "Insights:" not in watch_display + assert "Checkpoint Ξ”" in watch_display # Core data should still be present + + # But should still contain the core data + assert "Most Active Shards" in watch_display + assert "schema1.table1" in watch_display + assert "500" in watch_display # checkpoint delta diff --git a/tests/admin/test_distribution_analyzer.py b/tests/admin/test_distribution_analyzer.py new file mode 100644 index 00000000..000fd0f9 --- /dev/null +++ b/tests/admin/test_distribution_analyzer.py @@ -0,0 +1,294 @@ +""" +Tests for distribution analyzer functionality +""" + +from unittest.mock import Mock, patch + +from cratedb_toolkit.admin.xmover.analysis.table import DistributionAnalyzer, DistributionAnomaly, TableDistribution +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient + + +class TestDistributionAnalyzer: + def setup_method(self): + """Set up test fixtures""" + self.mock_client = Mock(spec=CrateDBClient) + self.analyzer = DistributionAnalyzer(self.mock_client) + + def test_coefficient_of_variation_calculation(self): + """Test CV calculation with different scenarios""" + + # Normal case + values = [10, 12, 8, 14, 6] + cv = self.analyzer.calculate_coefficient_of_variation(values) + assert cv > 0 + + # All equal values (should return 0) + equal_values = [10, 10, 10, 10] + cv_equal = self.analyzer.calculate_coefficient_of_variation(equal_values) + assert cv_equal == 0.0 + + # Empty list + empty_values = [] + cv_empty = self.analyzer.calculate_coefficient_of_variation(empty_values) + assert cv_empty == 0.0 + + # Single value + single_value = [10] + cv_single = self.analyzer.calculate_coefficient_of_variation(single_value) + assert cv_single == 0.0 + + def test_get_largest_tables_distribution(self): + """Test fetching table distribution data""" + + # Mock query results + mock_results = [ + # schema, table, node, 
primary_shards, replica_shards, total_shards, total_size, primary_size, replica_size, docs # noqa: E501, ERA001 + ["doc", "large_table", "node1", 5, 2, 7, 100.5, 80.2, 20.3, 1000000], + ["doc", "large_table", "node2", 4, 3, 7, 95.1, 75.8, 19.3, 950000], + ["doc", "large_table", "node3", 6, 1, 7, 110.2, 85.9, 24.3, 1100000], + ["custom", "another_table", "node1", 3, 2, 5, 50.1, 40.2, 9.9, 500000], + ["custom", "another_table", "node2", 2, 3, 5, 45.8, 35.1, 10.7, 480000], + ] + + self.mock_client.execute_query.return_value = mock_results + + distributions = self.analyzer.get_largest_tables_distribution(top_n=10) + + # Verify query was called with correct parameters + self.mock_client.execute_query.assert_called_once() + call_args = self.mock_client.execute_query.call_args + assert call_args[0][1] == [10] # top_n parameter + + # Verify we got the expected number of tables + assert len(distributions) == 2 + + # Verify table data structure + large_table = next(d for d in distributions if d.table_name == "large_table") + assert large_table.schema_name == "doc" + assert large_table.full_table_name == "large_table" # Should omit 'doc' schema + assert len(large_table.node_distributions) == 3 + + another_table = next(d for d in distributions if d.table_name == "another_table") + assert another_table.schema_name == "custom" + assert another_table.full_table_name == "custom.another_table" + assert len(another_table.node_distributions) == 2 + + # Verify sorting by primary size (descending) + assert distributions[0].total_primary_size_gb >= distributions[1].total_primary_size_gb + + def test_detect_shard_count_imbalance(self): + """Test shard count imbalance detection""" + + # Create test table with imbalanced shard distribution + imbalanced_table = TableDistribution( + schema_name="doc", + table_name="imbalanced_table", + total_primary_size_gb=500.0, + node_distributions={ + "node1": {"total_shards": 10, "primary_shards": 5, "replica_shards": 5}, + "node2": {"total_shards": 15, 
"primary_shards": 8, "replica_shards": 7}, + "node3": {"total_shards": 5, "primary_shards": 2, "replica_shards": 3}, + }, + ) + + anomaly = self.analyzer.detect_shard_count_imbalance(imbalanced_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Shard Count Imbalance" + assert anomaly.combined_score > 0 + assert len(anomaly.recommendations) > 0 + + # Create balanced table (should not detect anomaly) + balanced_table = TableDistribution( + schema_name="doc", + table_name="balanced_table", + total_primary_size_gb=100.0, + node_distributions={ + "node1": {"total_shards": 8, "primary_shards": 4, "replica_shards": 4}, + "node2": {"total_shards": 8, "primary_shards": 4, "replica_shards": 4}, + "node3": {"total_shards": 8, "primary_shards": 4, "replica_shards": 4}, + }, + ) + + no_anomaly = self.analyzer.detect_shard_count_imbalance(balanced_table) + assert no_anomaly is None + + def test_detect_storage_imbalance(self): + """Test storage imbalance detection""" + + # Create test table with storage imbalance + storage_imbalanced_table = TableDistribution( + schema_name="doc", + table_name="storage_imbalanced", + total_primary_size_gb=300.0, + node_distributions={ + "node1": {"total_size_gb": 150.0, "primary_size_gb": 100.0, "replica_size_gb": 50.0}, + "node2": {"total_size_gb": 50.0, "primary_size_gb": 30.0, "replica_size_gb": 20.0}, + "node3": {"total_size_gb": 100.0, "primary_size_gb": 70.0, "replica_size_gb": 30.0}, + }, + ) + + anomaly = self.analyzer.detect_storage_imbalance(storage_imbalanced_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Storage Imbalance" + assert anomaly.combined_score > 0 + + # Small table (should be ignored) + small_table = TableDistribution( + schema_name="doc", + table_name="small_table", + total_primary_size_gb=0.1, + node_distributions={ + "node1": {"total_size_gb": 0.5, "primary_size_gb": 0.05, "replica_size_gb": 0.05}, + "node2": {"total_size_gb": 0.1, "primary_size_gb": 0.03, "replica_size_gb": 
0.02}, + }, + ) + + no_anomaly = self.analyzer.detect_storage_imbalance(small_table) + assert no_anomaly is None + + def test_detect_node_coverage_issues(self): + """Test node coverage issue detection""" + + # Mock nodes_info to simulate cluster with 4 nodes + mock_nodes = [Mock(name="node1"), Mock(name="node2"), Mock(name="node3"), Mock(name="node4")] + self.mock_client.get_nodes_info.return_value = mock_nodes + + # Table with limited coverage (only on 2 out of 4 nodes) + limited_coverage_table = TableDistribution( + schema_name="doc", + table_name="limited_coverage", + total_primary_size_gb=100.0, # Significant size + node_distributions={ + "node1": {"total_shards": 10, "primary_shards": 5, "replica_shards": 5}, + "node2": {"total_shards": 10, "primary_shards": 5, "replica_shards": 5}, + # node3 and node4 missing + }, + ) + + anomaly = self.analyzer.detect_node_coverage_issues(limited_coverage_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Node Coverage Issue" + assert "node3" in anomaly.details["nodes_without_shards"] + assert "node4" in anomaly.details["nodes_without_shards"] + assert len(anomaly.recommendations) > 0 + + def test_detect_document_imbalance(self): + """Test document imbalance detection""" + + # Table with document imbalance + doc_imbalanced_table = TableDistribution( + schema_name="doc", + table_name="doc_imbalanced", + total_primary_size_gb=200.0, + node_distributions={ + "node1": {"total_documents": 1000000}, # 1M docs + "node2": {"total_documents": 500000}, # 500K docs + "node3": {"total_documents": 100000}, # 100K docs (5x imbalance) + }, + ) + + anomaly = self.analyzer.detect_document_imbalance(doc_imbalanced_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Document Imbalance" + assert "data skew" in anomaly.recommendations[0].lower() + + # Table with very few documents (should be ignored) + low_doc_table = TableDistribution( + schema_name="doc", + table_name="low_docs", + 
total_primary_size_gb=100.0, + node_distributions={ + "node1": {"total_documents": 1000}, + "node2": {"total_documents": 500}, + }, + ) + + no_anomaly = self.analyzer.detect_document_imbalance(low_doc_table) + assert no_anomaly is None + + def test_analyze_distribution_integration(self): + """Test the full analysis workflow""" + + # Mock the get_largest_tables_distribution method + mock_table = TableDistribution( + schema_name="doc", + table_name="test_table", + total_primary_size_gb=500.0, + node_distributions={ + "node1": { + "total_shards": 15, + "primary_shards": 8, + "replica_shards": 7, + "total_size_gb": 200.0, + "primary_size_gb": 120.0, + "replica_size_gb": 80.0, + "total_documents": 2000000, + }, + "node2": { + "total_shards": 8, + "primary_shards": 4, + "replica_shards": 4, + "total_size_gb": 100.0, + "primary_size_gb": 60.0, + "replica_size_gb": 40.0, + "total_documents": 1000000, + }, + "node3": { + "total_shards": 5, + "primary_shards": 3, + "replica_shards": 2, + "total_size_gb": 50.0, + "primary_size_gb": 30.0, + "replica_size_gb": 20.0, + "total_documents": 500000, + }, + }, + ) + + with patch.object(self.analyzer, "get_largest_tables_distribution", return_value=[mock_table]): + anomalies, tables_analyzed = self.analyzer.analyze_distribution(top_tables=10) + + # Should detect multiple types of anomalies + assert len(anomalies) > 0 + assert tables_analyzed == 1 # We provided 1 mock table + + # Anomalies should be sorted by combined score (descending) + if len(anomalies) > 1: + for i in range(len(anomalies) - 1): + assert anomalies[i].combined_score >= anomalies[i + 1].combined_score + + # Each anomaly should have required fields + for anomaly in anomalies: + assert anomaly.table is not None + assert anomaly.anomaly_type is not None + assert anomaly.combined_score >= 0 + assert isinstance(anomaly.recommendations, list) + + def test_format_distribution_report_no_anomalies(self): + """Test report formatting when no anomalies found""" + + # This should 
not raise an exception + with patch("builtins.print"): # Mock print to avoid console output during tests + self.analyzer.format_distribution_report([], 5) + + def test_format_distribution_report_with_anomalies(self): + """Test report formatting with anomalies""" + + mock_anomaly = DistributionAnomaly( + table=TableDistribution("doc", "test_table", 100.0, {}), + anomaly_type="Test Anomaly", + severity_score=7.5, + impact_score=8.0, + combined_score=60.0, + description="Test description", + details={}, + recommendations=["Test recommendation"], + ) + + # This should not raise an exception + with patch("builtins.print"): # Mock print to avoid console output during tests + self.analyzer.format_distribution_report([mock_anomaly], 3) diff --git a/tests/admin/test_recovery_monitor.py b/tests/admin/test_recovery_monitor.py new file mode 100644 index 00000000..c6d8a178 --- /dev/null +++ b/tests/admin/test_recovery_monitor.py @@ -0,0 +1,296 @@ +""" +Test script for XMover recovery monitoring functionality + +This script tests the recovery monitoring features by creating mock recovery scenarios +and verifying the output formatting and data parsing. 
+""" + +import os +import sys +from typing import Any, Dict +from unittest.mock import Mock + +from cratedb_toolkit.admin.xmover.model import RecoveryInfo +from cratedb_toolkit.admin.xmover.operational.monitor import RecoveryMonitor +from cratedb_toolkit.admin.xmover.util.database import CrateDBClient + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src")) + + +def create_mock_allocation( + schema_name: str, table_name: str, shard_id: int, current_state: str, node_id: str +) -> Dict[str, Any]: + """Create a mock allocation response""" + return { + "schema_name": schema_name, + "table_name": table_name, + "shard_id": shard_id, + "current_state": current_state, + "node_id": node_id, + "explanation": None, + } + + +def create_mock_shard_detail( + schema_name: str, + table_name: str, + shard_id: int, + node_name: str, + node_id: str, + recovery_type: str, + stage: str, + files_percent: float, + bytes_percent: float, + total_time: int, + size: int, + is_primary: bool, +) -> Dict[str, Any]: + """Create a mock shard detail response""" + return { + "schema_name": schema_name, + "table_name": table_name, + "shard_id": shard_id, + "node_name": node_name, + "node_id": node_id, + "routing_state": "RELOCATING", + "state": "RECOVERING", + "recovery": { + "type": recovery_type, + "stage": stage, + "files": {"percent": files_percent}, + "size": {"percent": bytes_percent}, + "total_time": total_time, + }, + "size": size, + "primary": is_primary, + } + + +def test_recovery_info_parsing(): + """Test RecoveryInfo dataclass and its properties""" + print("Testing RecoveryInfo parsing...") + + recovery = RecoveryInfo( + schema_name="CURVO", + table_name="PartioffD", + shard_id=19, + node_name="data-hot-1", + node_id="ZH6fBanGSjanGqeSh-sw0A", + recovery_type="PEER", + stage="DONE", + files_percent=100.0, + bytes_percent=100.0, + total_time_ms=1555907, + routing_state="RELOCATING", + current_state="RELOCATING", + is_primary=False, + 
size_bytes=56565284209, + ) + + # Test properties + assert recovery.overall_progress == 100.0, f"Expected 100.0, got {recovery.overall_progress}" + assert abs(recovery.size_gb - 52.681) < 0.01, f"Expected ~52.681, got {recovery.size_gb:.3f}" + assert recovery.shard_type == "REPLICA", f"Expected REPLICA, got {recovery.shard_type}" + assert recovery.total_time_seconds == 1555.907, f"Expected 1555.907, got {recovery.total_time_seconds}" + + print("βœ… RecoveryInfo parsing tests passed") + + +def test_database_client_parsing(): + """Test database client recovery parsing logic""" + print("Testing database client recovery parsing...") + + # Create a real client instance to test the parsing method + client = CrateDBClient.__new__(CrateDBClient) # Create without calling __init__ + + # Create test data + allocation = create_mock_allocation("CURVO", "PartioffD", 19, "RELOCATING", "node1") + shard_detail = create_mock_shard_detail( + "CURVO", "PartioffD", 19, "data-hot-1", "node1", "PEER", "DONE", 100.0, 100.0, 1555907, 56565284209, False + ) + + # Test the parsing method directly + recovery_info = client._parse_recovery_info(allocation, shard_detail) + + assert recovery_info.recovery_type == "PEER" + assert recovery_info.stage == "DONE" + assert recovery_info.overall_progress == 100.0 + + print("βœ… Database client parsing tests passed") + + +def test_recovery_monitor_formatting(): + """Test recovery monitor display formatting""" + print("Testing recovery monitor formatting...") + + # Create mock client + mock_client = Mock(spec=CrateDBClient) + monitor = RecoveryMonitor(mock_client) + + # Create test recovery data + recoveries = [ + RecoveryInfo( + schema_name="CURVO", + table_name="PartioffD", + shard_id=19, + node_name="data-hot-1", + node_id="node1", + recovery_type="PEER", + stage="DONE", + files_percent=100.0, + bytes_percent=100.0, + total_time_ms=1555907, + routing_state="RELOCATING", + current_state="RELOCATING", + is_primary=False, + size_bytes=56565284209, + ), + 
RecoveryInfo( + schema_name="CURVO", + table_name="orderTracking", + shard_id=7, + node_name="data-hot-2", + node_id="node2", + recovery_type="DISK", + stage="INDEX", + files_percent=75.5, + bytes_percent=67.8, + total_time_ms=890234, + routing_state="INITIALIZING", + current_state="INITIALIZING", + is_primary=True, + size_bytes=25120456789, + ), + ] + + # Test summary generation + summary = monitor.get_recovery_summary(recoveries) + + assert summary["total_recoveries"] == 2 + assert "PEER" in summary["by_type"] + assert "DISK" in summary["by_type"] + assert summary["by_type"]["PEER"]["count"] == 1 + assert summary["by_type"]["DISK"]["count"] == 1 + + # Test display formatting + display_output = monitor.format_recovery_display(recoveries) + + assert "Active Shard Recoveries (2 total)" in display_output + assert "PEER Recoveries (1)" in display_output + assert "DISK Recoveries (1)" in display_output + assert "PartioffD" in display_output + assert "orderTracking" in display_output + + print("βœ… Recovery monitor formatting tests passed") + + +def test_empty_recovery_handling(): + """Test handling of no active recoveries""" + print("Testing empty recovery handling...") + + mock_client = Mock(spec=CrateDBClient) + monitor = RecoveryMonitor(mock_client) + + # Test empty list + empty_recoveries = [] + + summary = monitor.get_recovery_summary(empty_recoveries) + assert summary["total_recoveries"] == 0 + assert summary["by_type"] == {} + + display_output = monitor.format_recovery_display(empty_recoveries) + assert "No active shard recoveries found" in display_output + + print("βœ… Empty recovery handling tests passed") + + +def test_recovery_type_filtering(): + """Test filtering by recovery type""" + print("Testing recovery type filtering...") + + mock_client = Mock(spec=CrateDBClient) + + # Mock the get_all_recovering_shards method + mock_recoveries = [ + RecoveryInfo( + schema_name="test", + table_name="table1", + shard_id=1, + node_name="node1", + node_id="n1", + 
recovery_type="PEER", + stage="DONE", + files_percent=100.0, + bytes_percent=100.0, + total_time_ms=1000, + routing_state="RELOCATING", + current_state="RELOCATING", + is_primary=True, + size_bytes=1000000, + ), + RecoveryInfo( + schema_name="test", + table_name="table2", + shard_id=2, + node_name="node2", + node_id="n2", + recovery_type="DISK", + stage="INDEX", + files_percent=50.0, + bytes_percent=45.0, + total_time_ms=2000, + routing_state="INITIALIZING", + current_state="INITIALIZING", + is_primary=False, + size_bytes=2000000, + ), + ] + + mock_client.get_all_recovering_shards.return_value = mock_recoveries + + monitor = RecoveryMonitor(mock_client) + + # Test filtering + peer_only = monitor.get_cluster_recovery_status(recovery_type_filter="PEER") + assert len(peer_only) == 1 + assert peer_only[0].recovery_type == "PEER" + + disk_only = monitor.get_cluster_recovery_status(recovery_type_filter="DISK") + assert len(disk_only) == 1 + assert disk_only[0].recovery_type == "DISK" + + all_recoveries = monitor.get_cluster_recovery_status(recovery_type_filter="all") + assert len(all_recoveries) == 2 + + print("βœ… Recovery type filtering tests passed") + + +def main(): + """Run all tests""" + print("πŸ§ͺ Running XMover Recovery Monitor Tests") + print("=" * 50) + + try: + test_recovery_info_parsing() + test_database_client_parsing() + test_recovery_monitor_formatting() + test_empty_recovery_handling() + test_recovery_type_filtering() + + print("\nπŸŽ‰ All tests passed successfully!") + print("\nπŸ“‹ Test Summary:") + print(" βœ… RecoveryInfo data class and properties") + print(" βœ… Database client parsing logic") + print(" βœ… Recovery monitor display formatting") + print(" βœ… Empty recovery state handling") + print(" βœ… Recovery type filtering") + + print("\nπŸš€ Recovery monitoring feature is ready for use!") + + except Exception as e: + print(f"\n❌ Test failed: {e}") + import traceback + + traceback.print_exc() + sys.exit(1)