Array Handling¶
This guide covers Transmog’s array processing capabilities, including the three handling modes and advanced array processing scenarios.
Array Handling Overview¶
Transmog provides three modes for handling arrays in nested data:
| Mode | Description | Use Case |
|---|---|---|
| `separate` (default) | Extract arrays into child tables | Relational analysis, database storage |
| `inline` | Keep arrays as JSON strings in main table | Document storage, minimal processing |
| `skip` | Ignore arrays during processing | Focus on scalar data only |
Separate Tables Mode (Default)¶
Basic Array Extraction¶
import transmog as tm
data = {
"product": {
"name": "Laptop",
"tags": ["electronics", "computers", "portable"],
"reviews": [
{"rating": 5, "comment": "Excellent"},
{"rating": 4, "comment": "Good value"}
]
}
}
# Default behavior: arrays become separate tables
result = tm.flatten(data, name="products", arrays="separate")
print("Main table:", result.main)
# [{'product_name': 'Laptop', '_id': 'generated_id'}]
print("Tags table:", result.tables["products_tags"])
# [
# {'value': 'electronics', '_parent_id': 'generated_id'},
# {'value': 'computers', '_parent_id': 'generated_id'},
# {'value': 'portable', '_parent_id': 'generated_id'}
# ]
print("Reviews table:", result.tables["products_reviews"])
# [
# {'rating': '5', 'comment': 'Excellent', '_parent_id': 'generated_id'},
# {'rating': '4', 'comment': 'Good value', '_parent_id': 'generated_id'}
# ]
Nested Array Processing¶
Arrays can contain objects with their own nested arrays:
data = {
"company": "TechCorp",
"departments": [
{
"name": "Engineering",
"teams": [
{"name": "Frontend", "size": 5},
{"name": "Backend", "size": 8}
]
},
{
"name": "Marketing",
"teams": [
{"name": "Digital", "size": 3}
]
}
]
}
result = tm.flatten(data, name="company", arrays="separate")
# Multiple levels of child tables
print("Tables created:", list(result.all_tables.keys()))
# ['company', 'company_departments', 'company_departments_teams']
# Department table
print("Departments:", result.tables["company_departments"])
# [
# {'name': 'Engineering', '_parent_id': 'main_id', '_id': 'dept_1'},
# {'name': 'Marketing', '_parent_id': 'main_id', '_id': 'dept_2'}
# ]
# Teams table (nested array)
print("Teams:", result.tables["company_departments_teams"])
# [
# {'name': 'Frontend', 'size': '5', '_parent_id': 'dept_1'},
# {'name': 'Backend', 'size': '8', '_parent_id': 'dept_1'},
# {'name': 'Digital', 'size': '3', '_parent_id': 'dept_2'}
# ]
Relationship Tracking¶
Parent-child relationships are preserved through ID fields:
# Build relationship map
def build_relationship_map(result):
    """Group child-table records under the main records they point to.

    Args:
        result: Object exposing ``main`` (an iterable of record dicts that
            carry an ``_id`` key) and ``tables`` (a mapping of table name to
            a list of record dicts that carry a ``_parent_id`` key).

    Returns:
        A dict keyed by main-record ``_id``. Each value is a dict with the
        keys ``"record"`` (the main record itself) and ``"children"`` (a
        dict mapping table name to the list of child records whose
        ``_parent_id`` equals that ``_id``).

    Raises:
        KeyError: If a main record has no ``_id`` key.

    Note:
        Only records whose ``_parent_id`` matches a main-record ``_id`` are
        attached. Records that point at another child record (deeper
        nesting levels) or that have no ``_parent_id`` are left out.
    """
    # Map main records
    relationships = {
        record["_id"]: {"record": record, "children": {}}
        for record in result.main
    }

    # Map child tables
    for table_name, records in result.tables.items():
        for record in records:
            # .get() keeps a record without a parent link from raising
            # KeyError; it is simply not attached to anything.
            parent = relationships.get(record.get("_parent_id"))
            if parent is None:
                continue
            parent["children"].setdefault(table_name, []).append(record)

    return relationships
relationships = build_relationship_map(result)
Inline Mode¶
JSON String Preservation¶
# Keep arrays as JSON strings in the main table
result = tm.flatten(data, name="products", arrays="inline")
print("Main table with inline arrays:", result.main)
# [
# {
# 'product_name': 'Laptop',
# 'product_tags': '["electronics", "computers", "portable"]',
# 'product_reviews': '[{"rating": 5, "comment": "Excellent"}, {"rating": 4, "comment": "Good value"}]',
# '_id': 'generated_id'
# }
# ]
print("Child tables:", result.tables)
# {} (empty - no child tables created)
When to Use Inline Mode¶
Inline mode is useful when:
- Document-oriented storage is preferred
- Array relationships are not needed for analysis
- Minimizing table count is important
- Arrays will be processed by other tools
Working with Inline Arrays¶
import json

result = tm.flatten(data, name="products", arrays="inline")

# Parse inline arrays when needed: in inline mode each array is stored as a
# JSON string, so it has to be decoded before use.
for record in result.main:
    if "product_tags" in record:
        tags = json.loads(record["product_tags"])
        print(f"Product tags: {tags}")
    if "product_reviews" in record:
        reviews = json.loads(record["product_reviews"])
        # An empty reviews array has no average; skip it instead of
        # dividing by zero.
        if reviews:
            avg_rating = sum(r["rating"] for r in reviews) / len(reviews)
            print(f"Average rating: {avg_rating}")
Skip Mode¶
Ignoring Arrays¶
# Skip arrays entirely during processing
result = tm.flatten(data, name="products", arrays="skip")
print("Main table (arrays skipped):", result.main)
# [{'product_name': 'Laptop', '_id': 'generated_id'}]
print("Child tables:", result.tables)
# {} (empty - arrays were ignored)
When to Use Skip Mode¶
Skip mode is useful when:
- Only scalar data is relevant
- Arrays contain unstructured or irrelevant data
- Simplifying data structure is the goal
- Array processing will be handled separately
Advanced Array Scenarios¶
Mixed Array Types¶
data = {
"record": {
"scalar_values": [1, 2, 3, 4, 5], # Simple values
"object_array": [ # Complex objects
{"id": 1, "name": "Item A"},
{"id": 2, "name": "Item B"}
],
"mixed_array": [ # Mixed types
{"type": "object", "data": {"value": 10}},
{"type": "string", "data": "text_value"},
{"type": "number", "data": 42}
]
}
}
result = tm.flatten(data, name="records", arrays="separate")
# All arrays are extracted consistently
print("Scalar values table:", result.tables["records_scalar_values"])
# [
# {'value': '1', '_parent_id': 'main_id'},
# {'value': '2', '_parent_id': 'main_id'},
# ...
# ]
print("Object array table:", result.tables["records_object_array"])
# [
# {'id': '1', 'name': 'Item A', '_parent_id': 'main_id'},
# {'id': '2', 'name': 'Item B', '_parent_id': 'main_id'}
# ]
Empty and Null Arrays¶
data = {
"item": {
"name": "Product",
"empty_array": [],
"null_array": None,
"valid_array": ["item1", "item2"]
}
}
result = tm.flatten(data, name="items", arrays="separate")
# Empty and null arrays are handled gracefully
print("Tables created:", list(result.tables.keys()))
# ['items_valid_array'] (only non-empty arrays create tables)
# Control empty array handling
result = tm.flatten(data, name="items", arrays="separate", skip_empty=False)
# Now empty arrays may create empty tables
Array Field Naming¶
Table names for arrays follow a predictable pattern:
data = {
"company": {
"departments": [
{
"teams": [
{"members": ["Alice", "Bob"]}
]
}
]
}
}
result = tm.flatten(data, name="org", arrays="separate")
# Table naming: {entity_name}_{array_path}
print("Table names:", list(result.tables.keys()))
# [
# 'org_departments',
# 'org_departments_teams',
# 'org_departments_teams_members'
# ]
Performance Considerations¶
Memory Usage by Mode¶
# Separate mode: More tables, distributed memory usage
result_separate = tm.flatten(data, arrays="separate")
print(f"Tables: {len(result_separate.all_tables)}")
# Inline mode: Fewer tables, concentrated memory usage
result_inline = tm.flatten(data, arrays="inline")
print(f"Tables: {len(result_inline.all_tables)}") # Usually 1
# Skip mode: Minimal memory usage
result_skip = tm.flatten(data, arrays="skip")
print(f"Tables: {len(result_skip.all_tables)}") # Usually 1
Large Array Processing¶
# For large arrays, use streaming with separate mode
tm.flatten_stream(
large_data_with_arrays,
output_path="output/",
name="large_dataset",
output_format="parquet",
arrays="separate", # Best for large arrays
batch_size=1000,
low_memory=True
)
# For very large arrays that don't need analysis, use inline
result = tm.flatten(
data_with_huge_arrays,
name="documents",
arrays="inline", # Avoid creating huge child tables
low_memory=True
)
Working with Array Results¶
Analyzing Array Data¶
def analyze_arrays(result):
    """Analyze array structure in results.

    Args:
        result: Object exposing ``tables``, a mapping of table name to a
            list of record dicts.

    Returns:
        A dict mapping each table name to a summary dict with:

        * ``"record_count"``: number of records in the table.
        * ``"fields"``: field names seen in the table, in first-seen order
          (empty list for an empty table).
        * ``"parent_count"``: number of distinct ``_parent_id`` values among
          the records that carry one.
    """
    analysis = {}
    for table_name, records in result.tables.items():
        # Collect keys across every record, not just the first one, so
        # tables with heterogeneous records report all of their fields.
        fields = list(dict.fromkeys(key for record in records for key in record))
        # Records without a parent link are ignored rather than raising
        # KeyError.
        parent_ids = {r["_parent_id"] for r in records if "_parent_id" in r}
        analysis[table_name] = {
            "record_count": len(records),
            "fields": fields,
            "parent_count": len(parent_ids),
        }
    return analysis
# Analyze the arrays
result = tm.flatten(complex_data, name="analysis", arrays="separate")
array_info = analyze_arrays(result)

# Print one summary line per child table.
for table, info in array_info.items():
    print(f"{table}: {info['record_count']} records, {info['parent_count']} parents")
Reconstructing Arrays¶
def reconstruct_arrays(result, entity_name=None):
    """Reconstruct original array structure from separated tables.

    Args:
        result: Object exposing ``main`` (an iterable of record dicts that
            carry an ``_id`` key) and ``tables`` (a mapping of table name to
            a list of record dicts that carry a ``_parent_id`` key).
        entity_name: Optional entity name used as the table-name prefix
            (``{entity_name}_{array_path}``). Pass it when the entity name
            itself contains an underscore; otherwise everything after the
            first underscore of the table name is used as the array field.

    Returns:
        A list with one dict per main record (a shallow copy of the record,
        including its ``_id``). For each child table that has records
        pointing at the main record, the dict gains a key named after the
        array field holding the list of those records with all keys
        starting with ``_`` removed.

    Raises:
        KeyError: If a main record has no ``_id`` key.

    Note:
        Only records whose ``_parent_id`` matches a main-record ``_id`` are
        attached; records pointing at another child record, or without a
        ``_parent_id``, are left out.
    """
    # Start with main records
    reconstructed = {r["_id"]: dict(r) for r in result.main}
    prefix = f"{entity_name}_" if entity_name else None

    # Add arrays back
    for table_name, records in result.tables.items():
        # Extract array field name from table name
        if prefix and table_name.startswith(prefix):
            array_field = table_name[len(prefix):]
        else:
            # Fall back to the whole name when there is no underscore,
            # instead of raising IndexError.
            _, separator, remainder = table_name.partition("_")
            array_field = remainder if separator else table_name

        # Group records by parent
        for record in records:
            parent = reconstructed.get(record.get("_parent_id"))
            if parent is None:
                continue
            # Remove metadata fields for cleaner reconstruction
            clean_record = {
                k: v for k, v in record.items() if not k.startswith("_")
            }
            parent.setdefault(array_field, []).append(clean_record)

    return list(reconstructed.values())
# Reconstruct original structure
original_structure = reconstruct_arrays(result)
Configuration Patterns¶
Database-Optimized Arrays¶
# Configuration for database loading
result = tm.flatten(
data,
name="entities",
arrays="separate", # Create relational tables
id_field="id", # Use natural IDs
preserve_types=False, # Convert to strings for SQL
skip_null=True # Clean data for import
)
Document-Optimized Arrays¶
# Configuration for document storage
result = tm.flatten(
data,
name="documents",
arrays="inline", # Keep arrays as JSON
preserve_types=True, # Maintain type information
skip_empty=False # Keep structure complete
)
Analytics-Optimized Arrays¶
# Configuration for data analysis
result = tm.flatten(
data,
name="analytics",
arrays="separate", # Enable array analysis
preserve_types=True, # Keep numeric types
add_timestamp=True, # Add processing metadata
skip_null=False # Keep all data points
)
Next Steps¶
- ID Management - Control record identification
- Output Formats - Choose optimal output formats for array data
- Performance Guide - Optimize array processing performance