Array Handling

This guide covers Transmog’s array processing capabilities, including the three handling modes and more advanced array scenarios.

Array Handling Overview

Transmog provides three modes for handling arrays in nested data:

Mode          Description                                  Use Case
"separate"    Extract arrays into child tables             Relational analysis, database storage
"inline"      Keep arrays as JSON strings in main table    Document storage, minimal processing
"skip"        Ignore arrays during processing              Focus on scalar data only
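
Each mode is selected through the arrays argument of tm.flatten, as the sections below show in detail. A minimal side-by-side sketch (the sample data here is illustrative, not one of the examples below):

import transmog as tm

data = {"item": {"name": "Example", "tags": ["a", "b"]}}

# Only the arrays argument changes between the three calls
separate = tm.flatten(data, name="example", arrays="separate")  # tags become a child table
inline = tm.flatten(data, name="example", arrays="inline")      # tags stay as a JSON string
skipped = tm.flatten(data, name="example", arrays="skip")       # tags are dropped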

Separate Tables Mode (Default)

Basic Array Extraction

import transmog as tm

data = {
    "product": {
        "name": "Laptop",
        "tags": ["electronics", "computers", "portable"],
        "reviews": [
            {"rating": 5, "comment": "Excellent"},
            {"rating": 4, "comment": "Good value"}
        ]
    }
}

# Default behavior: arrays become separate tables
result = tm.flatten(data, name="products", arrays="separate")

print("Main table:", result.main)
# [{'product_name': 'Laptop', '_id': 'generated_id'}]

print("Tags table:", result.tables["products_tags"])
# [
#   {'value': 'electronics', '_parent_id': 'generated_id'},
#   {'value': 'computers', '_parent_id': 'generated_id'},
#   {'value': 'portable', '_parent_id': 'generated_id'}
# ]

print("Reviews table:", result.tables["products_reviews"])
# [
#   {'rating': '5', 'comment': 'Excellent', '_parent_id': 'generated_id'},
#   {'rating': '4', 'comment': 'Good value', '_parent_id': 'generated_id'}
# ]

Nested Array Processing

Arrays can contain objects with their own nested arrays:

data = {
    "company": "TechCorp",
    "departments": [
        {
            "name": "Engineering",
            "teams": [
                {"name": "Frontend", "size": 5},
                {"name": "Backend", "size": 8}
            ]
        },
        {
            "name": "Marketing",
            "teams": [
                {"name": "Digital", "size": 3}
            ]
        }
    ]
}

result = tm.flatten(data, name="company", arrays="separate")

# Multiple levels of child tables
print("Tables created:", list(result.all_tables.keys()))
# ['company', 'company_departments', 'company_departments_teams']

# Department table
print("Departments:", result.tables["company_departments"])
# [
#   {'name': 'Engineering', '_parent_id': 'main_id', '_id': 'dept_1'},
#   {'name': 'Marketing', '_parent_id': 'main_id', '_id': 'dept_2'}
# ]

# Teams table (nested array)
print("Teams:", result.tables["company_departments_teams"])
# [
#   {'name': 'Frontend', 'size': '5', '_parent_id': 'dept_1'},
#   {'name': 'Backend', 'size': '8', '_parent_id': 'dept_1'},
#   {'name': 'Digital', 'size': '3', '_parent_id': 'dept_2'}
# ]

Relationship Tracking

Parent-child relationships are preserved through ID fields:

# Build relationship map
def build_relationship_map(result):
    relationships = {}

    # Map main records
    for record in result.main:
        relationships[record["_id"]] = {
            "record": record,
            "children": {}
        }

    # Map child tables (only records whose parent is a main record are
    # attached here; deeper nesting levels reference intermediate parent IDs)
    for table_name, records in result.tables.items():
        for record in records:
            parent_id = record["_parent_id"]
            if parent_id in relationships:
                if table_name not in relationships[parent_id]["children"]:
                    relationships[parent_id]["children"][table_name] = []
                relationships[parent_id]["children"][table_name].append(record)

    return relationships

relationships = build_relationship_map(result)
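
The map can then be walked to summarize how many child records attach to each main record; a brief illustrative follow-up:

for parent_id, info in relationships.items():
    for table_name, children in info["children"].items():
        print(f"{parent_id}: {len(children)} record(s) in {table_name}")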

Inline Mode

JSON String Preservation

# Keep arrays as JSON strings in the main table
result = tm.flatten(data, name="products", arrays="inline")

print("Main table with inline arrays:", result.main)
# [
#   {
#     'product_name': 'Laptop',
#     'product_tags': '["electronics", "computers", "portable"]',
#     'product_reviews': '[{"rating": 5, "comment": "Excellent"}, {"rating": 4, "comment": "Good value"}]',
#     '_id': 'generated_id'
#   }
# ]

print("Child tables:", result.tables)
# {} (empty - no child tables created)

When to Use Inline Mode

Inline mode is useful when:

  • Document-oriented storage is preferred

  • Array relationships are not needed for analysis

  • Minimizing table count is important

  • Arrays will be processed by other tools

Working with Inline Arrays

import json

result = tm.flatten(data, name="products", arrays="inline")

# Parse inline arrays when needed
for record in result.main:
    if "product_tags" in record:
        tags = json.loads(record["product_tags"])
        print(f"Product tags: {tags}")

    if "product_reviews" in record:
        reviews = json.loads(record["product_reviews"])
        avg_rating = sum(r["rating"] for r in reviews) / len(reviews)
        print(f"Average rating: {avg_rating}")

Skip Mode

Ignoring Arrays

# Skip arrays entirely during processing
result = tm.flatten(data, name="products", arrays="skip")

print("Main table (arrays skipped):", result.main)
# [{'product_name': 'Laptop', '_id': 'generated_id'}]

print("Child tables:", result.tables)
# {} (empty - arrays were ignored)

When to Use Skip Mode

Skip mode is useful when:

  • Only scalar data is relevant

  • Arrays contain unstructured or irrelevant data

  • Simplifying data structure is the goal

  • Array processing will be handled separately (see the sketch after this list)
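
When array handling is deferred like this, the arrays remain available in the original input. A minimal sketch (plain Python around tm.flatten, not a dedicated Transmog feature) that flattens scalars with skip mode and reads one array directly from the source dict:

import transmog as tm

data = {
    "product": {
        "name": "Laptop",
        "tags": ["electronics", "computers"]
    }
}

# Scalars go through Transmog; the array is handled outside it
result = tm.flatten(data, name="products", arrays="skip")
tags = data["product"]["tags"]

print(result.main)  # scalar fields only
print(tags)         # processed separately, e.g. by another pipeline step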

Advanced Array Scenarios

Mixed Array Types

data = {
    "record": {
        "scalar_values": [1, 2, 3, 4, 5],              # Simple values
        "object_array": [                               # Complex objects
            {"id": 1, "name": "Item A"},
            {"id": 2, "name": "Item B"}
        ],
        "mixed_array": [                                # Mixed types
            {"type": "object", "data": {"value": 10}},
            {"type": "string", "data": "text_value"},
            {"type": "number", "data": 42}
        ]
    }
}

result = tm.flatten(data, name="records", arrays="separate")

# All arrays are extracted consistently
print("Scalar values table:", result.tables["records_scalar_values"])
# [
#   {'value': '1', '_parent_id': 'main_id'},
#   {'value': '2', '_parent_id': 'main_id'},
#   ...
# ]

print("Object array table:", result.tables["records_object_array"])
# [
#   {'id': '1', 'name': 'Item A', '_parent_id': 'main_id'},
#   {'id': '2', 'name': 'Item B', '_parent_id': 'main_id'}
# ]

Empty and Null Arrays

data = {
    "item": {
        "name": "Product",
        "empty_array": [],
        "null_array": None,
        "valid_array": ["item1", "item2"]
    }
}

result = tm.flatten(data, name="items", arrays="separate")

# Empty and null arrays are handled gracefully
print("Tables created:", list(result.tables.keys()))
# ['items_valid_array'] (only non-empty arrays create tables)

# Control empty array handling
result = tm.flatten(data, name="items", arrays="separate", skip_empty=False)
# Now empty arrays may create empty tables

Array Field Naming

Table names for arrays follow a predictable pattern:

data = {
    "company": {
        "departments": [
            {
                "teams": [
                    {"members": ["Alice", "Bob"]}
                ]
            }
        ]
    }
}

result = tm.flatten(data, name="org", arrays="separate")

# Table naming: {entity_name}_{array_path}
print("Table names:", list(result.tables.keys()))
# [
#   'org_departments',
#   'org_departments_teams',
#   'org_departments_teams_members'
# ]

Performance Considerations

Memory Usage by Mode

# Separate mode: More tables, distributed memory usage
result_separate = tm.flatten(data, name="example", arrays="separate")
print(f"Tables: {len(result_separate.all_tables)}")

# Inline mode: Fewer tables, concentrated memory usage
result_inline = tm.flatten(data, name="example", arrays="inline")
print(f"Tables: {len(result_inline.all_tables)}")  # Usually 1

# Skip mode: Minimal memory usage
result_skip = tm.flatten(data, name="example", arrays="skip")
print(f"Tables: {len(result_skip.all_tables)}")    # Usually 1

Large Array Processing

# For large arrays, use streaming with separate mode
tm.flatten_stream(
    large_data_with_arrays,
    output_path="output/",
    name="large_dataset",
    output_format="parquet",
    arrays="separate",      # Best for large arrays
    batch_size=1000,
    low_memory=True
)

# For very large arrays that don't need analysis, use inline
result = tm.flatten(
    data_with_huge_arrays,
    name="documents",
    arrays="inline",        # Avoid creating huge child tables
    low_memory=True
)

Working with Array Results

Analyzing Array Data

def analyze_arrays(result):
    """Analyze array structure in results."""
    analysis = {}

    for table_name, records in result.tables.items():
        analysis[table_name] = {
            "record_count": len(records),
            "fields": list(records[0].keys()) if records else [],
            "parent_count": len(set(r["_parent_id"] for r in records))
        }

    return analysis

# Analyze the arrays
result = tm.flatten(complex_data, name="analysis", arrays="separate")
array_info = analyze_arrays(result)

for table, info in array_info.items():
    print(f"{table}: {info['record_count']} records, {info['parent_count']} parents")

Reconstructing Arrays

def reconstruct_arrays(result):
    """Reconstruct original array structure from separated tables."""
    # Start with main records
    reconstructed = {r["_id"]: dict(r) for r in result.main}

    # Add first-level arrays back (nested child tables reference intermediate
    # parent IDs, so this sketch does not reattach them)
    for table_name, records in result.tables.items():
        # Extract the array field name from the table name
        array_field = table_name.split("_", 1)[1]

        # Group records by parent
        for record in records:
            parent_id = record["_parent_id"]
            if parent_id in reconstructed:
                if array_field not in reconstructed[parent_id]:
                    reconstructed[parent_id][array_field] = []

                # Remove metadata fields for cleaner reconstruction
                clean_record = {k: v for k, v in record.items()
                                if not k.startswith("_")}
                reconstructed[parent_id][array_field].append(clean_record)

    return list(reconstructed.values())

# Reconstruct original structure
original_structure = reconstruct_arrays(result)

Configuration Patterns

Database-Optimized Arrays

# Configuration for database loading
result = tm.flatten(
    data,
    name="entities",
    arrays="separate",      # Create relational tables
    id_field="id",          # Use natural IDs
    preserve_types=False,   # Convert to strings for SQL
    skip_null=True         # Clean data for import
)

Document-Optimized Arrays

# Configuration for document storage
result = tm.flatten(
    data,
    name="documents",
    arrays="inline",        # Keep arrays as JSON
    preserve_types=True,    # Maintain type information
    skip_empty=False       # Keep structure complete
)

Analytics-Optimized Arrays

# Configuration for data analysis
result = tm.flatten(
    data,
    name="analytics",
    arrays="separate",      # Enable array analysis
    preserve_types=True,    # Keep numeric types
    add_timestamp=True,     # Add processing metadata
    skip_null=False        # Keep all data points
)
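
Because every table in the result is a plain list of dictionaries, it can be handed to other tooling directly. For example, assuming pandas is installed, each child table loads straight into a DataFrame:

import pandas as pd

# Each child table is a list of dicts, so DataFrame construction needs no conversion
for table_name, records in result.tables.items():
    df = pd.DataFrame(records)
    print(table_name, df.shape)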

Next Steps