# Performance Optimization

Transmog performance can be optimized through configuration tuning, data preparation, and processing strategies. This guide covers techniques for maximizing throughput and minimizing processing time.

## Configuration Optimization

### Batch Size Tuning

Batch size significantly impacts processing performance:

```python
import transmog as tm
import time

def benchmark_batch_sizes(data, sizes=[100, 500, 1000, 2000, 5000]):
    """Benchmark different batch sizes."""
    results = {}

    for size in sizes:
        start = time.time()
        result = tm.flatten(data, batch_size=size)
        duration = time.time() - start
        results[size] = duration

    return results

# Find optimal batch size
data = [{"nested": {"field": i}} for i in range(10000)]
timings = benchmark_batch_sizes(data)

for size, duration in timings.items():
    print(f"Batch size {size}: {duration:.2f}s")
```

### Memory vs Speed Trade-offs

Balance memory usage against processing speed. The system includes adaptive memory management that automatically adjusts processing parameters based on available memory:

```python
# High-performance configuration
fast_config = {
    "batch_size": 5000,
    "low_memory": False,
    "preserve_types": False
}

# Memory-efficient configuration (uses adaptive batch sizing)
efficient_config = {
    "batch_size": 1000,
    "low_memory": True,
    "preserve_types": False
}

# Benchmark both approaches
data = load_large_dataset()

start = time.time()
fast_result = tm.flatten(data, **fast_config)
fast_time = time.time() - start

start = time.time()
efficient_result = tm.flatten(data, **efficient_config)
efficient_time = time.time() - start

print(f"Fast config: {fast_time:.2f}s")
print(f"Efficient config: {efficient_time:.2f}s")
```

## Data Preparation Optimization

### Input Data Structure

Optimize input data structure for better performance:

```python
# Inefficient: Deeply nested with many empty fields
inefficient_data = {
    "level1": {
        "level2": {
            "level3": {
                "level4": {
                    "value": "data",
                    "empty1": None,
                    "empty2": "",
                    "empty3": []
                }
            }
        }
    }
}

# Efficient: Flatter structure, minimal empty fields
efficient_data = {
    "level1_level2_value": "data",
    "metadata": {"timestamp": "2024-01-01"}
}

# Process with optimized settings
result = tm.flatten(
    efficient_data,
    skip_empty=True,
    skip_null=True,
    nested_threshold=3
)
```

### Array Handling Optimization

Choose the optimal array handling strategy:

```python
data_with_arrays = {
    "users": [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"}
    ]
}

# Fastest: Skip arrays if not needed
result_skip = tm.flatten(data_with_arrays, arrays="skip")

# Moderate: Inline small arrays
result_inline = tm.flatten(data_with_arrays, arrays="inline")

# Slowest: Separate tables (most flexible)
result_separate = tm.flatten(data_with_arrays, arrays="separate")
```

## Processing Strategies

### Parallel Processing

Process multiple datasets concurrently:

```python
import concurrent.futures
import transmog as tm

def process_file(file_path):
    """Process a single file."""
    return tm.flatten_file(
        file_path,
        batch_size=2000,
        low_memory=True
    )

# Process multiple files in parallel
file_paths = ["data1.json", "data2.json", "data3.json"]

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(process_file, path) for path in file_paths]
    results = [future.result() for future in futures]

print(f"Processed {len(results)} files")
```
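Thread pools work best when much of the processing happens in native code; for pure-Python, CPU-bound flattening, the GIL can limit how much they help. A process pool is an alternative worth benchmarking. The sketch below is a generic Python pattern, not a documented Transmog feature; it reuses the `process_file` helper and file list from the example above and assumes the returned results are picklable:

```python
import concurrent.futures
import transmog as tm

def process_file(file_path):
    """Process a single file (same helper as above)."""
    return tm.flatten_file(file_path, batch_size=2000, low_memory=True)

if __name__ == "__main__":
    file_paths = ["data1.json", "data2.json", "data3.json"]

    # Worker processes side-step the GIL for CPU-bound flattening work
    with concurrent.futures.ProcessPoolExecutor(max_workers=3) as executor:
        results = list(executor.map(process_file, file_paths))

    print(f"Processed {len(results)} files")
```

Process start-up and result pickling add overhead, so measure both approaches on representative files before committing to one.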
### Incremental Processing

Process large datasets incrementally:

```python
def process_incrementally(large_dataset, chunk_size=10000):
    """Process dataset in chunks."""
    results = []

    for i in range(0, len(large_dataset), chunk_size):
        chunk = large_dataset[i:i + chunk_size]

        result = tm.flatten(
            chunk,
            name=f"chunk_{i//chunk_size}",
            batch_size=2000
        )
        results.append(result)

        # Optional: Clear memory between chunks
        del chunk

    return results

# Process 100k records in 10k chunks
large_data = generate_large_dataset(100000)
chunk_results = process_incrementally(large_data)
```

## Advanced Configuration

### Memory Optimization Features

Transmog includes adaptive memory management that automatically adjusts processing parameters:

```python
import transmog as tm

# Enable memory tracking and adaptive sizing
result = tm.flatten(
    large_dataset,
    batch_size=1000,  # Starting batch size
    low_memory=True,  # Enable memory-efficient mode
    # Memory optimization is automatic when low_memory=True
)

# The system will:
# - Monitor memory usage during processing
# - Adapt batch sizes based on available memory
# - Use strategic garbage collection to reduce pressure
# - Apply in-place modifications to reduce allocations
```

### Memory Optimization Strategies

The memory optimization system reduces memory usage through several techniques:

- **In-place modifications**: 60-70% reduction in object allocations
- **Efficient path building**: 40-50% reduction in string operations
- **Adaptive caching**: Memory-aware cache sizing that responds to pressure
- **Strategic garbage collection**: Intelligent timing of memory cleanup
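These mechanisms are internal to Transmog and applied automatically in `low_memory` mode. As a rough, library-agnostic illustration of two of the ideas above, here is a minimal sketch of memory-pressure-aware batching with strategic garbage collection; the `memory_pressure` and `adaptive_batches` helpers, the `psutil` check, the 0.8 threshold, and the halving strategy are all illustrative assumptions, not Transmog's actual implementation:

```python
import gc
import psutil

def memory_pressure():
    """Fraction of system memory currently in use (0.0-1.0)."""
    return psutil.virtual_memory().percent / 100.0

def adaptive_batches(records, batch_size=1000, pressure_threshold=0.8):
    """Yield batches, shrinking them and collecting garbage under memory pressure."""
    i = 0
    while i < len(records):
        if memory_pressure() > pressure_threshold:
            # Halve the batch size (down to a floor) and reclaim memory first
            batch_size = max(100, batch_size // 2)
            gc.collect()
        yield records[i:i + batch_size]
        i += batch_size

# Illustrative usage: feed smaller chunks to tm.flatten when memory is tight
# results = [tm.flatten(batch, batch_size=len(batch)) for batch in adaptive_batches(data)]
```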
### Custom Processor Configuration

Use advanced processor settings for optimal performance:

```python
from transmog.process import Processor
from transmog.config import TransmogConfig

# Performance-optimized configuration with memory awareness
config = (
    TransmogConfig.performance_optimized()
    .with_memory_optimization(
        memory_tracking_enabled=True,
        adaptive_batch_sizing=True,
        memory_pressure_threshold=0.8
    )
    .with_processing(
        batch_size=5000,
        memory_limit="2GB"
    )
    .with_naming(
        separator="_",
        max_depth=5
    )
)

processor = Processor(config)
result = processor.process(data)
```

### Type Preservation Optimization

Optimize type handling based on use case:

```python
# Fast: Convert all to strings
result_strings = tm.flatten(
    data,
    preserve_types=False,
    batch_size=5000
)

# Slower but preserves data integrity
result_typed = tm.flatten(
    data,
    preserve_types=True,
    batch_size=3000
)
```

## Performance Monitoring

### Execution Time Profiling

Profile processing performance:

```python
import time
import cProfile

def profile_processing():
    """Profile flatten operation."""
    data = generate_test_data(10000)

    # Profile with cProfile
    profiler = cProfile.Profile()
    profiler.enable()

    result = tm.flatten(data, batch_size=2000)

    profiler.disable()
    profiler.print_stats(sort='tottime')

    return result

# Run profiling
profile_processing()
```

### Memory Usage Monitoring

Track memory consumption:

```python
import psutil
import os

class MemoryMonitor:
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.initial_memory = self.current_memory()

    def current_memory(self):
        return self.process.memory_info().rss / 1024 / 1024  # MB

    def memory_used(self):
        return self.current_memory() - self.initial_memory

# Monitor processing
monitor = MemoryMonitor()

result = tm.flatten(
    large_data,
    batch_size=2000,
    low_memory=True
)

print(f"Memory used: {monitor.memory_used():.2f} MB")
print(f"Records processed: {len(result.main)}")
```

### Throughput Measurement

Measure processing throughput:

```python
def measure_throughput(data, config):
    """Measure records processed per second."""
    start_time = time.time()
    result = tm.flatten(data, **config)
    end_time = time.time()

    duration = end_time - start_time
    record_count = len(result.main)
    throughput = record_count / duration

    return {
        "duration": duration,
        "records": record_count,
        "throughput": throughput
    }

# Test different configurations
configs = [
    {"batch_size": 1000, "low_memory": True},
    {"batch_size": 2000, "low_memory": True},
    {"batch_size": 5000, "low_memory": False}
]

data = generate_test_data(50000)

for i, config in enumerate(configs):
    metrics = measure_throughput(data, config)
    print(f"Config {i+1}: {metrics['throughput']:.0f} records/sec")
```

## Optimization Guidelines

### Choosing Optimal Settings

Recommended configurations for different scenarios:

```python
# High-volume, simple data
high_volume_config = {
    "batch_size": 5000,
    "low_memory": False,
    "preserve_types": False,
    "skip_empty": True,
    "arrays": "inline"
}

# Complex nested data
complex_data_config = {
    "batch_size": 2000,
    "low_memory": True,
    "preserve_types": True,
    "nested_threshold": 6,
    "arrays": "separate"
}

# Memory-constrained environment
memory_constrained_config = {
    "batch_size": 500,
    "low_memory": True,
    "preserve_types": False,
    "skip_empty": True,
    "skip_null": True
}
```

### Performance Testing Framework

Create systematic performance tests:

```python
class PerformanceTest:
    def __init__(self, data_generator):
        self.data_generator = data_generator
        self.results = []

    def test_config(self, config, data_size=10000):
        """Test a specific configuration."""
        data = self.data_generator(data_size)

        start_time = time.time()
        initial_memory = self.get_memory()

        result = tm.flatten(data, **config)

        end_time = time.time()
        final_memory = self.get_memory()

        metrics = {
            "config": config,
            "duration": end_time - start_time,
            "memory_used": final_memory - initial_memory,
            "records_processed": len(result.main),
            "throughput": len(result.main) / (end_time - start_time)
        }
        self.results.append(metrics)
        return metrics

    def get_memory(self):
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024

    def best_config(self, metric="throughput"):
        """Find best configuration by metric."""
        return max(self.results, key=lambda x: x[metric])

# Usage
def generate_nested_data(size):
    return [{"nested": {"field": i}} for i in range(size)]

tester = PerformanceTest(generate_nested_data)

# Test multiple configurations
configs = [
    {"batch_size": 1000},
    {"batch_size": 2000},
    {"batch_size": 5000}
]

for config in configs:
    metrics = tester.test_config(config)
    print(f"Config {config}: {metrics['throughput']:.0f} records/sec")

best = tester.best_config("throughput")
print(f"Best config: {best['config']}")
```
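Note that `best_config` takes the maximum of the chosen metric, which is what you want for `throughput`. For metrics where smaller is better, such as `duration` or `memory_used`, invert the comparison:

```python
# Lowest memory footprint rather than highest throughput
best_memory = min(tester.results, key=lambda x: x["memory_used"])
print(f"Most memory-efficient config: {best_memory['config']}")
```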
## Real-World Optimization Examples

### E-commerce Data Processing

Optimize for typical e-commerce data structures:

```python
# E-commerce product data
ecommerce_data = {
    "products": [
        {
            "id": "prod_123",
            "name": "Widget",
            "price": 19.99,
            "attributes": {
                "color": "red",
                "size": "large",
                "materials": ["plastic", "metal"]
            },
            "reviews": [
                {"rating": 5, "comment": "Great!"},
                {"rating": 4, "comment": "Good"}
            ]
        }
    ]
}

# Optimized processing
result = tm.flatten(
    ecommerce_data,
    batch_size=3000,
    arrays="separate",            # Extract reviews as separate table
    id_field={"products": "id"},  # Use product ID
    preserve_types=True           # Keep price as number
)
```

### Log Data Processing

Optimize for log file processing:

```python
# Log entry structure
log_entries = [
    {
        "timestamp": "2024-01-01T12:00:00Z",
        "level": "INFO",
        "message": "User action",
        "context": {
            "user_id": "user_123",
            "session_id": "sess_456",
            "metadata": {
                "ip": "192.168.1.1",
                "user_agent": "Mozilla/5.0..."
            }
        }
    }
]

# Optimized for log processing
result = tm.flatten(
    log_entries,
    batch_size=10000,      # Large batches for simple structures
    preserve_types=False,  # Everything as strings
    skip_empty=True,       # Remove empty fields
    arrays="skip",         # Skip arrays in logs
    separator="."          # Use dot notation
)
```

### Sensor Data Processing

Optimize for IoT sensor data:

```python
# Sensor readings
sensor_data = [
    {
        "device_id": "sensor_001",
        "timestamp": "2024-01-01T12:00:00Z",
        "readings": {
            "temperature": 23.5,
            "humidity": 65.2,
            "pressure": 1013.25
        },
        "location": {
            "lat": 40.7128,
            "lon": -74.0060
        }
    }
]

# Optimized for numerical data
result = tm.flatten(
    sensor_data,
    batch_size=5000,
    preserve_types=True,   # Keep numerical precision
    arrays="inline",       # Simple structure
    id_field="device_id",  # Use device ID
    add_timestamp=True     # Add processing timestamp
)
```
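To confirm that one of these configurations holds up at realistic volumes, the monitoring helpers defined earlier in this guide can be reused. A small, illustrative check for the sensor configuration (the `sample` name and the replication factor are arbitrary choices for the sketch):

```python
# Reuse MemoryMonitor and measure_throughput from the Performance Monitoring sections
sample = sensor_data * 50000  # replicate the single reading into a larger sample

monitor = MemoryMonitor()
metrics = measure_throughput(sample, {
    "batch_size": 5000,
    "preserve_types": True,
    "arrays": "inline"
})

print(f"Throughput: {metrics['throughput']:.0f} records/sec")
print(f"Memory used: {monitor.memory_used():.2f} MB")
```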