Skip to content

SMILES Augmentation Tutorial

This tutorial demonstrates how to use AugChem's SMILES augmentation capabilities for molecular data enhancement.

Prerequisites

pip install augchem rdkit pandas

Basic Setup

from augchem import Augmentator
import pandas as pd

# Initialize the augmentator with a seed for reproducibility.
# Later examples call augmentator.SMILES.augment_data(...) on this object.
augmentator = Augmentator(seed=42)

Loading and Preparing SMILES Data

import pandas as pd

# Build a five-molecule toy dataset: one SMILES string and one numeric
# property value per molecule, listed in matching order.
sample_smiles = [
    'CC(=O)O',         # acetic acid
    'CCO',             # ethanol
    'C1=CC=CC=C1',     # benzene
    'CC(C)O',          # isopropanol
    'C1=CC=C(C=C1)O',  # phenol
]
sample_property = [0.45, 1.23, -0.87, 0.62, -0.34]
data = {'SMILES': sample_smiles, 'Property_0': sample_property}

# Write the table to disk (without the index column) so that the later
# examples can load it from 'molecules.csv'.
df = pd.DataFrame(data)
df.to_csv('molecules.csv', index=False)
print("Sample dataset created!")

Individual SMILES Augmentation Methods

from augchem.modules.smiles.smiles_modules import (
    mask, delete, swap, fusion, enumerateSmiles, tokenize
)

# Single example molecule that every call below operates on.
original_smiles = "CC(=O)O"  # Acetic acid
print(f"Original SMILES: {original_smiles}")

# 1. Tokenization - understand SMILES structure
tokens = tokenize(original_smiles)
print(f"Tokens: {tokens}")

# 2. Masking - replace tokens with [M]
# NOTE(review): mask_ratio is presumably the fraction of tokens to mask —
# confirm against the augchem API.
masked = mask(original_smiles, mask_ratio=0.3, seed=42)
print(f"Masked: {masked}")

# 3. Deletion - remove random tokens
# NOTE(review): delete_ratio is presumably the fraction of tokens removed —
# confirm against the augchem API.
deleted = delete(original_smiles, delete_ratio=0.2, seed=42)
print(f"Deleted: {deleted}")

# 4. Swapping - exchange atom positions
swapped = swap(original_smiles, seed=42)
print(f"Swapped: {swapped}")

# 5. Fusion - randomly apply mask/delete/swap
fused = fusion(original_smiles, mask_ratio=0.1, delete_ratio=0.2, seed=42)
print(f"Fused: {fused}")

# 6. Enumeration - generate non-canonical SMILES
# Unlike the calls above, no seed is passed here.
enumerated = enumerateSmiles(original_smiles)
print(f"Enumerated: {enumerated}")

Dataset-Level SMILES Augmentation

from augchem.modules.smiles.smiles_modules import augment_dataset

# Load your dataset
df = pd.read_csv('molecules.csv')

# Apply augmentation using individual function
# The DataFrame itself is passed as `dataset`; `col_to_augment` and
# `property_col` name the columns created in the sample dataset.
augmented_df = augment_dataset(
    col_to_augment="SMILES",
    dataset=df,
    augmentation_methods=["mask", "delete", "fusion", "enumeration"],
    mask_ratio=0.1,
    delete_ratio=0.3,
    augment_percentage=0.4,  # 40% more molecules
    property_col="Property_0",
    seed=42
)

# Report the size before and after, and the number of rows that were added.
print(f"Original: {len(df)} molecules")
print(f"Augmented: {len(augmented_df)} molecules")
print(f"New molecules: {len(augmented_df) - len(df)}")
# Using the main Augmentator class
augmentator = Augmentator(seed=42)

# Augment SMILES data
# Here the dataset is given as a CSV path rather than a DataFrame.
result = augmentator.SMILES.augment_data(
    dataset="molecules.csv",
    augmentation_methods=["fusion", "enumeration", "mask"],
    mask_ratio=0.15,
    delete_ratio=0.25,
    augment_percentage=0.3,
    col_to_augment="SMILES",
    property_col="Property_0"
)

# Plain string literal: the message has no placeholders, so no f-prefix.
print("Augmentation complete! Dataset saved as 'Augmented_molecules.csv'")
print(f"Final dataset size: {len(result)} molecules")

SMILES Augmentation Methods Summary

| Method | Description | Parameters | Best For |
|--------|-------------|------------|----------|
| mask | Replace tokens with '[M]' | mask_ratio | Language modeling |
| delete | Remove random tokens | delete_ratio | Robustness testing |
| swap | Exchange atom positions | None | Structural variation |
| fusion | Random method selection | mask_ratio, delete_ratio | Diverse augmentation |
| enumeration | Non-canonical SMILES | None | Canonical diversity |

Understanding SMILES Augmentation Results

# Read the augmented table back from disk.
result = pd.read_csv('Augmented_molecules.csv')

# Rows with a missing parent_idx are counted as originals, the rest as augmented.
is_original = result['parent_idx'].isna()
original_count = is_original.sum()
augmented_count = len(result) - original_count

print(f"Original molecules: {original_count}")
print(f"Augmented molecules: {augmented_count}")

# Show up to five augmented rows next to the row their parent_idx points at.
augmented_only = result[~is_original]
print("\nAugmentation examples:")
for _, row in augmented_only.head(5).iterrows():
    original = result.iloc[int(row['parent_idx'])]['SMILES']
    augmented = row['SMILES']
    print(f"Original: {original} → Augmented: {augmented}")

SMILES Quality Control

from rdkit import Chem

def validate_smiles(smiles_list):
    """Validate SMILES strings using RDKit.

    Args:
        smiles_list: Iterable of candidate SMILES values.

    Returns:
        tuple: ``(valid_count, invalid_smiles)`` - the number of entries that
        RDKit parsed into a molecule, and the list of rejected entries in
        input order.
    """
    valid_count = 0
    invalid_smiles = []

    for smiles in smiles_list:
        # Missing values (None, or NaN from an empty CSV cell) are not
        # strings; report them as invalid instead of handing them to
        # MolFromSmiles.
        if isinstance(smiles, str) and Chem.MolFromSmiles(smiles) is not None:
            valid_count += 1
        else:
            invalid_smiles.append(smiles)

    return valid_count, invalid_smiles

# Validate augmented SMILES
# Every entry of the 'SMILES' column of `result` is checked.
# NOTE(review): strings produced by masking (containing '[M]') or deletion
# will presumably fail RDKit parsing — confirm that a reduced valid count is
# expected here.
smiles_list = result['SMILES'].tolist()
valid_count, invalid = validate_smiles(smiles_list)

print(f"Valid SMILES: {valid_count}/{len(smiles_list)}")
print(f"Invalid SMILES: {len(invalid)}")
# Show at most three rejected strings as a sample.
if invalid:
    print("Examples of invalid SMILES:", invalid[:3])

Parameter Optimization

def find_optimal_smiles_parameters():
    """Compare a fixed set of SMILES augmentation settings.

    Each candidate setting is applied to ``molecules.csv`` with the
    ``fusion`` and ``mask`` methods; the output size and the fraction of
    RDKit-parsable SMILES are recorded.

    Returns:
        pandas.DataFrame: One row per candidate with the tested parameters,
        ``total_molecules`` and ``valid_ratio``.
    """
    # (mask_ratio, delete_ratio, augment_percentage) triples to evaluate.
    candidate_grid = (
        (0.1, 0.2, 0.2),
        (0.15, 0.3, 0.3),
        (0.2, 0.25, 0.4),
    )

    summary_rows = []
    for mask_ratio, delete_ratio, augment_percentage in candidate_grid:
        settings = {
            'mask_ratio': mask_ratio,
            'delete_ratio': delete_ratio,
            'augment_percentage': augment_percentage,
        }
        augmented = augmentator.SMILES.augment_data(
            dataset="molecules.csv",
            **settings,
            augmentation_methods=["fusion", "mask"]
        )

        # Share of the augmented table that RDKit accepts.
        n_valid, _ = validate_smiles(augmented['SMILES'].tolist())
        ratio = n_valid / len(augmented)

        row = dict(settings)
        row['total_molecules'] = len(augmented)
        row['valid_ratio'] = ratio
        summary_rows.append(row)

    return pd.DataFrame(summary_rows)

# Run optimization
# The returned table has one row per tested parameter set, including its
# 'total_molecules' and 'valid_ratio' columns.
optimization_results = find_optimal_smiles_parameters()
print(optimization_results)

Large Dataset Processing

def process_large_dataset(csv_path, chunk_size=1000):
    """Augment a large CSV dataset chunk by chunk.

    The file is streamed once with pandas' ``chunksize`` reader, so the
    input is read a single time instead of being re-scanned from the top
    for every chunk, and the row total no longer depends on a raw line
    count.

    Args:
        csv_path: Path to the CSV file to augment.
        chunk_size: Maximum number of data rows per chunk.

    Returns:
        pandas.DataFrame: The augmented chunks concatenated with a fresh
        index.

    Raises:
        ValueError: If the file contains no data rows.
    """
    augmented_chunks = []

    reader = pd.read_csv(csv_path, chunksize=chunk_size)
    try:
        for chunk_number, chunk in enumerate(reader, start=1):
            if chunk.empty:
                # A header-only file can yield one empty chunk; skip it.
                continue

            # Give each chunk its own 0..n-1 index, as a standalone read
            # of those rows would have.
            chunk = chunk.reset_index(drop=True)

            # Augment chunk
            augmented_chunk = augmentator.SMILES.augment_data(
                dataset=chunk,
                augmentation_methods=["fusion"],
                augment_percentage=0.1
            )

            augmented_chunks.append(augmented_chunk)
            print(f"Processed chunk {chunk_number}")
    finally:
        # Release the file handle even if augmentation fails part-way.
        reader.close()

    if not augmented_chunks:
        raise ValueError(f"No data rows found in {csv_path!r}")

    # Combine results
    # NOTE(review): any per-chunk row references in the augmented output
    # (e.g. a parent index) would be relative to their own chunk — confirm
    # before relying on them after concatenation.
    return pd.concat(augmented_chunks, ignore_index=True)

# Example usage for large datasets
# large_result = process_large_dataset("large_molecules.csv", chunk_size=500)

Best Practices for SMILES Augmentation

1. Validation First

Always validate SMILES strings before and after augmentation:

# Validate input
# Check the dataset's SMILES column entry by entry. (`original_smiles` from
# the earlier example is a single string, so iterating over it would test
# individual characters rather than molecules.)
input_smiles = df['SMILES'].tolist()
valid_input = [s for s in input_smiles if validate_smiles([s])[0] > 0]
print(f"Valid input SMILES: {len(valid_input)}/{len(input_smiles)}")

2. Conservative Parameters

Start with low augmentation ratios:

# Recommended starting parameters: low mask/delete ratios and a small
# augmentation percentage.
CONSERVATIVE_PARAMS = dict(
    mask_ratio=0.1,
    delete_ratio=0.2,
    augment_percentage=0.2,
)

3. Chemical Diversity Monitoring

Check that augmentation preserves chemical diversity:

from rdkit import Chem

def check_diversity(smiles_list):
    """Count the distinct molecules in ``smiles_list``.

    Each string is parsed with RDKit and converted back with
    ``Chem.MolToSmiles``; strings for which parsing yields a falsy result
    are skipped.

    Returns:
        int: Number of distinct ``Chem.MolToSmiles`` outputs.
    """
    parsed = (Chem.MolFromSmiles(entry) for entry in smiles_list)
    return len({Chem.MolToSmiles(molecule) for molecule in parsed if molecule})

# Compare the dataset's SMILES column before and after augmentation.
# (`original_smiles` from the earlier example is one string, not a list of
# molecules, so the column values are used instead.)
original_smiles_list = df['SMILES'].tolist()
augmented_smiles = result['SMILES'].tolist()

orig_diversity = check_diversity(original_smiles_list)
aug_diversity = check_diversity(augmented_smiles)
print(f"Diversity preserved: {aug_diversity >= orig_diversity}")

4. Reproducible Augmentation

Always use seeds for reproducible results:

# Reproducible augmentation
# One constant feeds both the Augmentator constructor and each augment_data call.
EXPERIMENT_SEED = 42
augmentator = Augmentator(seed=EXPERIMENT_SEED)

# NOTE(review): the earlier examples pass the dataset as `dataset=` and give
# no `seed` argument to augment_data — confirm this call form against the
# augchem API.
augmented_1 = augmentator.SMILES.augment_data("molecules.csv", seed=EXPERIMENT_SEED)
augmented_2 = augmentator.SMILES.augment_data("molecules.csv", seed=EXPERIMENT_SEED)
# Results will be identical

Common Issues and Solutions

Invalid SMILES Generation

# Filter out invalid SMILES after augmentation
def filter_valid_smiles(df):
    """Return only the rows of ``df`` whose ``'SMILES'`` entry RDKit can parse.

    Args:
        df: DataFrame with a ``'SMILES'`` column.

    Returns:
        pandas.DataFrame: The kept rows, re-indexed from 0. All columns are
        retained, including when ``df`` has no rows.
    """
    # Non-string entries (None, or NaN from an empty CSV cell) count as
    # invalid rather than being handed to MolFromSmiles.
    flags = [
        isinstance(smiles, str) and Chem.MolFromSmiles(smiles) is not None
        for smiles in df['SMILES']
    ]
    # An explicit boolean array keeps this a row filter even when it is
    # empty; a bare empty list would be treated as a column selection.
    valid_mask = pd.Series(flags, dtype=bool).to_numpy()
    return df.loc[valid_mask].reset_index(drop=True)

# Apply filter
clean_result = filter_valid_smiles(result)
# Separate the before/after counts; printed back-to-back they read as a
# single number.
print(f"Filtered: {len(result)} → {len(clean_result)} valid molecules")

Property Column Handling

# Ensure property columns are correctly preserved
result_with_properties = augmentator.SMILES.augment_data(
    dataset="molecules.csv",
    property_col="Property_0",  # Specify property column
    augmentation_methods=["enumeration"]  # Use safe methods
)

# Check property preservation
# NOTE(review): `augmented_props` is computed but never used, and the message
# below only counts the unique values of the ORIGINAL column — it does not
# compare them with the augmented output.
original_props = df['Property_0'].tolist()
augmented_props = result_with_properties['Property_0'].tolist()
print(f"Properties preserved: {len(set(original_props))} unique values")

Integration with Machine Learning

from sklearn.model_selection import train_test_split

# Prepare augmented dataset for ML
# Features are the SMILES strings, labels the 'Property_0' values of `result`.
augmented_smiles = result['SMILES'].tolist()
augmented_labels = result['Property_0'].tolist()

# Train/test split
# 20% of the rows are held out; random_state fixes the shuffle.
# NOTE(review): rows are split individually, so an augmented variant and its
# parent molecule can presumably land on opposite sides of the split —
# confirm whether a split grouped by parent is needed for your evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    augmented_smiles, augmented_labels, 
    test_size=0.2, random_state=42
)

print(f"Training set: {len(X_train)} molecules")
print(f"Test set: {len(X_test)} molecules")

# Ready for molecular descriptors calculation and ML training

Next Steps

After completing this SMILES tutorial:

  • Understand all SMILES augmentation methods
  • Apply quality control and validation
  • Optimize parameters for your datasets
  • Handle large datasets efficiently
  • Integrate with ML pipelines

See Also