SMILES Examples
This section provides practical examples of using AugChem for SMILES-based molecular data augmentation.
Example 1: Basic SMILES Augmentation
from augchem import Augmentator
from augchem.modules.smiles.smiles_modules import (
mask, delete, swap, fusion, enumerateSmiles, tokenize
)
import pandas as pd
# Build the augmentation entry point; seed=42 is handed to the constructor
# (presumably to make random augmentations repeatable - confirm in the API docs).
augmentator = Augmentator(seed=42)

# Reference molecules used by this example, one SMILES string per entry.
molecules = [
    "CCO",                    # Ethanol
    "CC(=O)O",                # Acetic acid
    "c1ccccc1",               # Benzene
    "CC(C)O",                 # Isopropanol
    "C1=CC=C(C=C1)O",         # Phenol
    "CCN(CC)CC",              # Triethylamine
    "CC(C)(C)O",              # tert-Butanol
    "C1=CC=C2C(=C1)C=CC=C2",  # Naphthalene
]

print("Original SMILES molecules:")
# Number the entries from 1, right-aligned in a two-character column.
for position, smiles in enumerate(molecules, start=1):
    print(f"{position:2d}. {smiles}")
Example 2: Individual Augmentation Methods
# Run every augmentation function once on the same molecule and print the result.
original = "CC(=O)Oc1ccccc1C(=O)O"  # Aspirin
print(f"Original molecule: {original}")
print()

# 1. Tokenization - understand structure
tokens = tokenize(original)
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")
print()

# Ratios shared by the masking and deletion demonstrations below.
demo_ratios = (0.1, 0.2, 0.3)

# 2. Masking - replace tokens with [M]
masked_variants = []
for ratio in demo_ratios:
    masked = mask(original, mask_ratio=ratio, seed=42)
    masked_variants.append(masked)
    print(f"Masked ({ratio:.1f}): {masked}")
print()

# 3. Deletion - remove random tokens
deleted_variants = []
for ratio in demo_ratios:
    deleted = delete(original, delete_ratio=ratio, seed=42)
    deleted_variants.append(deleted)
    print(f"Deleted ({ratio:.1f}): {deleted}")
print()

# 4. Swapping - exchange atom positions
# A different seed (42, 43, 44) is passed on each attempt.
swapped_variants = []
for attempt, swap_seed in enumerate(range(42, 45), start=1):
    swapped = swap(original, seed=swap_seed)
    swapped_variants.append(swapped)
    print(f"Swapped #{attempt}: {swapped}")
print()

# 5. Fusion - combined methods
fusion_variants = []
for attempt, fusion_seed in enumerate(range(42, 45), start=1):
    fused = fusion(original, mask_ratio=0.1, delete_ratio=0.15, seed=fusion_seed)
    fusion_variants.append(fused)
    print(f"Fusion #{attempt}: {fused}")
print()

# 6. Enumeration - non-canonical SMILES
# No seed is passed here; the call is simply repeated five times.
enumerated_variants = []
for attempt in range(1, 6):
    enumerated = enumerateSmiles(original)
    enumerated_variants.append(enumerated)
    print(f"Enumerated #{attempt}: {enumerated}")
Example 3: Dataset-Level Augmentation
# Create a molecular dataset with properties: one SMILES string per row plus
# three property columns (LogP, MW, Activity) of the same length.
data = {
    'SMILES': [
        'CCO',                    # Ethanol
        'CC(=O)O',                # Acetic acid
        'c1ccccc1',               # Benzene
        'CC(C)O',                 # Isopropanol
        'C1=CC=C(C=C1)O',         # Phenol
        'CCN(CC)CC',              # Triethylamine
        'CC(C)(C)O',              # tert-Butanol
        'C1=CC=C2C(=C1)C=CC=C2',  # Naphthalene
        'CC(=O)Nc1ccc(cc1)O',     # Paracetamol
        'CC(=O)Oc1ccccc1C(=O)O',  # Aspirin
    ],
    # Negative values must use the ASCII hyphen-minus; the typographic minus
    # sign (U+2212) is not a valid Python operator and fails to parse.
    'LogP': [-0.31, -0.17, 2.13, 0.05, 1.46, 1.45, 0.35, 3.30, 0.46, 1.19],
    'MW': [46.07, 60.05, 78.11, 60.10, 94.11, 101.19, 74.12, 128.17, 151.16, 180.16],
    'Activity': [0, 0, 1, 0, 1, 0, 0, 1, 1, 1],
}

df = pd.DataFrame(data)
# Written without the index column so the file holds only the four data columns.
df.to_csv('molecular_dataset.csv', index=False)

print(f"Created dataset with {len(df)} molecules")
print(df.head())
Example 4: Comprehensive Augmentation Strategy
from augchem.modules.smiles.smiles_modules import augment_dataset
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('molecular_dataset.csv')

# Gather every augmentation setting in one mapping, then expand it into the call.
augmentation_settings = {
    "col_to_augment": "SMILES",
    "dataset": df,
    "augmentation_methods": ["mask", "delete", "fusion", "enumeration"],
    "mask_ratio": 0.15,
    "delete_ratio": 0.25,
    "augment_percentage": 0.6,  # 60% more molecules
    "property_col": "LogP",     # Preserve LogP values
    "seed": 42,
}
augmented_df = augment_dataset(**augmentation_settings)

print(f"Dataset expanded from {len(df)} to {len(augmented_df)} molecules")
print(f"New molecules added: {len(augmented_df) - len(df)}")

# Save augmented dataset
augmented_df.to_csv('augmented_molecular_dataset.csv', index=False)

# Analyze augmentation results
print("\nAugmentation Analysis:")
# Rows with a missing parent_idx are counted as originals.
# NOTE(review): assumes augment_dataset adds a 'parent_idx' column that is
# empty for original rows - confirm against the augment_dataset documentation.
is_original_row = augmented_df['parent_idx'].isna()
original_count = is_original_row.sum()
augmented_count = len(augmented_df) - original_count
print(f"Original molecules: {original_count}")
print(f"Augmented molecules: {augmented_count}")
print(f"Augmentation ratio: {augmented_count/original_count:.2f}")
Example 5: Using the Main Augmentator Class
# Initialize with custom parameters
augmentator = Augmentator(seed=123)

# Method 1: Direct augmentation - the dataset is given as a CSV path.
direct_settings = {
    "dataset": "molecular_dataset.csv",
    "augmentation_methods": ["fusion", "enumeration", "mask"],
    "mask_ratio": 0.20,
    "delete_ratio": 0.30,
    "augment_percentage": 0.5,
    "col_to_augment": "SMILES",
    "property_col": "MW",  # Preserve molecular weight
}
result = augmentator.SMILES.augment_data(**direct_settings)
print(f"Augmented dataset size: {len(result)}")

# Method 2: Step-by-step processing - the dataset is given as DataFrames.
df = pd.read_csv('molecular_dataset.csv')

# Apply different methods to different subsets:
# the first 5 molecules and everything after them.
subset1, subset2 = df.iloc[:5], df.iloc[5:]

# Column arguments common to both subset calls.
shared_columns = {"col_to_augment": "SMILES", "property_col": "Activity"}

# Conservative augmentation for subset 1
aug1 = augmentator.SMILES.augment_data(
    dataset=subset1,
    augmentation_methods=["enumeration"],
    augment_percentage=0.3,
    **shared_columns,
)

# Aggressive augmentation for subset 2
aug2 = augmentator.SMILES.augment_data(
    dataset=subset2,
    augmentation_methods=["mask", "delete", "fusion"],
    mask_ratio=0.25,
    delete_ratio=0.35,
    augment_percentage=0.8,
    **shared_columns,
)

# Combine results into one frame with a fresh 0..n-1 index.
combined_result = pd.concat([aug1, aug2], ignore_index=True)
print(f"Combined augmented dataset: {len(combined_result)} molecules")
Example 6: Quality Control and Validation
from rdkit import Chem
from rdkit.Chem import Descriptors
import matplotlib.pyplot as plt
def validate_and_analyze_smiles(df, smiles_col='SMILES'):
    """Validate every SMILES entry in a column and print summary statistics.

    Each string in ``df[smiles_col]`` is parsed with ``Chem.MolFromSmiles``.
    Entries that parse are collected as valid, together with their
    ``Descriptors.MolWt`` and ``Descriptors.MolLogP`` values. Entries for
    which the parser returns ``None``, and entries that are not strings at
    all (e.g. missing values), are collected as invalid.

    Args:
        df: Table supporting ``df[smiles_col]`` iteration and ``len(df)``;
            the surrounding examples pass a pandas DataFrame.
        smiles_col: Name of the column holding the SMILES strings.

    Returns:
        A tuple ``(valid_smiles, invalid_smiles, molecular_weights,
        logp_values)`` of lists. The last two lists have one entry per
        valid SMILES, in the same order as ``valid_smiles``.
    """
    valid_smiles = []
    invalid_smiles = []
    molecular_weights = []
    logp_values = []

    for smiles in df[smiles_col]:
        # Non-string cells (None/NaN) are never handed to the parser; they
        # are recorded as invalid instead of aborting the whole validation.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            invalid_smiles.append(smiles)
            continue
        valid_smiles.append(smiles)
        molecular_weights.append(Descriptors.MolWt(mol))
        logp_values.append(Descriptors.MolLogP(mol))

    # Print validation results
    total = len(df)
    # An empty table would otherwise divide by zero; report 0.0% instead.
    valid_pct = len(valid_smiles) / total * 100 if total else 0.0
    print("Validation Results:")
    print(f" Valid SMILES: {len(valid_smiles)}/{total} ({valid_pct:.1f}%)")
    print(f" Invalid SMILES: {len(invalid_smiles)}")
    if invalid_smiles:
        print(f" Examples of invalid SMILES: {invalid_smiles[:3]}")

    # Calculate statistics
    # Both descriptor lists grow in lockstep, so one emptiness check guards
    # the means and ranges of both.
    if molecular_weights:
        print("\nMolecular Weight Statistics:")
        print(f" Mean: {sum(molecular_weights)/len(molecular_weights):.2f}")
        print(f" Range: {min(molecular_weights):.2f} - {max(molecular_weights):.2f}")
        print("\nLogP Statistics:")
        print(f" Mean: {sum(logp_values)/len(logp_values):.2f}")
        print(f" Range: {min(logp_values):.2f} - {max(logp_values):.2f}")

    return valid_smiles, invalid_smiles, molecular_weights, logp_values
# Validate the original and the augmented dataset with the same helper.
print("=== Original Dataset Validation ===")
orig_valid, orig_invalid, orig_mw, orig_logp = validate_and_analyze_smiles(df)

print("\n=== Augmented Dataset Validation ===")
aug_valid, aug_invalid, aug_mw, aug_logp = validate_and_analyze_smiles(augmented_df)

# SMILES length analysis: character count of every string in each dataset.
orig_lengths = list(map(len, df['SMILES']))
aug_lengths = list(map(len, augmented_df['SMILES']))

# Visualize distributions on a 2x2 grid: three histogram panels plus one
# text panel in the bottom-right corner.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, original values, augmented values, x-axis label, panel title)
histogram_panels = [
    (axes[0, 0], orig_mw, aug_mw, 'Molecular Weight', 'Molecular Weight Distribution'),
    (axes[0, 1], orig_logp, aug_logp, 'LogP', 'LogP Distribution'),
    (axes[1, 0], orig_lengths, aug_lengths, 'SMILES Length', 'SMILES String Length Distribution'),
]
for panel, values_before, values_after, x_label, panel_title in histogram_panels:
    # Original and augmented values are overlaid semi-transparently.
    panel.hist(values_before, bins=20, alpha=0.7, label='Original', color='blue')
    panel.hist(values_after, bins=20, alpha=0.7, label='Augmented', color='red')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)
    panel.legend()

# Summary statistics
# The two check-mark lines at the end are fixed text, not computed from the data.
stats_text = f"""Dataset Quality Summary:
Original Dataset:
• Total molecules: {len(df)}
• Valid SMILES: {len(orig_valid)} ({len(orig_valid)/len(df)*100:.1f}%)
• Avg MW: {sum(orig_mw)/len(orig_mw):.1f}
• Avg LogP: {sum(orig_logp)/len(orig_logp):.2f}
• Avg length: {sum(orig_lengths)/len(orig_lengths):.1f}
Augmented Dataset:
• Total molecules: {len(augmented_df)}
• Valid SMILES: {len(aug_valid)} ({len(aug_valid)/len(augmented_df)*100:.1f}%)
• Avg MW: {sum(aug_mw)/len(aug_mw):.1f}
• Avg LogP: {sum(aug_logp)/len(aug_logp):.2f}
• Avg length: {sum(aug_lengths)/len(aug_lengths):.1f}
Quality Metrics:
• Validity preservation: {len(aug_valid)/len(augmented_df)*100:.1f}%
• Chemical diversity maintained: ✓
• Property distributions preserved: ✓"""

# Render the summary as monospace text inside a rounded gray box, anchored
# near the top-left of the panel in axes coordinates, with the axes hidden.
summary_panel = axes[1, 1]
summary_box = dict(boxstyle="round,pad=0.3", facecolor="lightgray")
summary_panel.text(
    0.05,
    0.95,
    stats_text,
    transform=summary_panel.transAxes,
    fontsize=9,
    verticalalignment='top',
    fontfamily='monospace',
    bbox=summary_box,
)
summary_panel.set_xlim(0, 1)
summary_panel.set_ylim(0, 1)
summary_panel.axis('off')
summary_panel.set_title('Quality Assessment')

plt.tight_layout()
plt.show()
Example 7: Advanced Augmentation Strategies
def create_stratified_augmentation(df, target_col, smiles_col='SMILES'):
    """Augment each class of ``target_col`` separately and concatenate the results.

    Rows are grouped by the distinct values of ``df[target_col]``. Rows whose
    value equals 0 are passed to ``augment_dataset`` with the "conservative"
    settings (enumeration only, ``augment_percentage=0.3``); every other class
    uses the "aggressive" settings (mask, fusion and enumeration with
    ``mask_ratio=0.20``, ``delete_ratio=0.25``, ``augment_percentage=0.8``).
    One progress line is printed per class.

    Args:
        df: pandas DataFrame containing ``target_col`` and ``smiles_col``.
        target_col: Column whose values define the classes; it is also passed
            to ``augment_dataset`` as ``property_col``.
        smiles_col: Column holding the SMILES strings to augment.

    Returns:
        A DataFrame with the per-class results concatenated under a fresh
        index. When ``df`` has no rows, an empty copy of ``df`` is returned.
    """
    # Settings that differ between the two strategies; everything else is
    # identical and passed explicitly in the call below.
    conservative = {
        "augmentation_methods": ["enumeration"],
        "augment_percentage": 0.3,
    }
    aggressive = {
        "augmentation_methods": ["mask", "fusion", "enumeration"],
        "mask_ratio": 0.20,
        "delete_ratio": 0.25,
        "augment_percentage": 0.8,
    }

    augmented_dfs = []
    for class_val in df[target_col].unique():
        class_df = df[df[target_col] == class_val].copy()

        # Class 0 (inactive compounds) gets the conservative strategy;
        # any other class (active compounds) gets the aggressive one.
        if class_val == 0:
            label, settings = "Conservative", conservative
        else:
            label, settings = "Aggressive", aggressive

        aug_df = augment_dataset(
            col_to_augment=smiles_col,
            dataset=class_df,
            property_col=target_col,
            seed=42,
            **settings,
        )
        print(f"Class {class_val}: {label} augmentation ({len(class_df)} -> {len(aug_df)})")
        augmented_dfs.append(aug_df)

    # pd.concat raises ValueError on an empty list, which happens exactly when
    # the input has no rows; hand back an empty copy with the same columns.
    if not augmented_dfs:
        return df.copy()

    # Combine all classes
    return pd.concat(augmented_dfs, ignore_index=True)
# Run the stratified augmentation with 'Activity' as the class column.
stratified_result = create_stratified_augmentation(df, 'Activity')
print("\nStratified augmentation results:")
print(f"Final dataset size: {len(stratified_result)}")

# Analyze class distribution: count rows per class, ordered by class value.
activity_counts = stratified_result['Activity'].value_counts()
class_dist = activity_counts.sort_index()
print("Class distribution after augmentation:")
for activity_class, n_molecules in class_dist.items():
    print(f" Class {activity_class}: {n_molecules} molecules")
Example 8: Real-World Application - Drug Discovery
# Simulate a drug discovery dataset: one compound per row with an IC50 value,
# a solubility value and a target label.
# NOTE(review): the SMILES strings labelled "Loratadine" and "Estradiol" do not
# appear to match the published structures of those compounds - verify them
# against a reference database before relying on the names.
drug_data = {
    'SMILES': [
        'CC(C)Cc1ccc(cc1)[C@@H](C)C(=O)O',  # Ibuprofen
        'CC(=O)Oc1ccccc1C(=O)O',  # Aspirin
        'CC(=O)Nc1ccc(cc1)O',  # Paracetamol
        'Clc1ccc(cc1)C(c2ccccc2)N3CCCC3',  # Loratadine
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
        'CC12CCC3C(C1CCC2O)CCC4=CC(=O)CCC34C',  # Testosterone
        'C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)CCC4=CC(=O)CC[C@H]34',  # Estradiol
    ],
    'IC50': [2.1, 5.4, 12.3, 0.8, 25.6, 3.2, 1.9],  # Inhibition concentration
    'Solubility': [0.021, 0.3, 14.0, 0.004, 21.6, 0.024, 0.013],  # mg/mL
    'Target': ['COX', 'COX', 'COX', 'H1R', 'PDE', 'AR', 'ER'],
}
drug_df = pd.DataFrame(drug_data)
print("Drug Discovery Dataset:")
print(drug_df)

# Apply targeted augmentation for each drug class (one group per 'Target' value).
target_groups = drug_df.groupby('Target')
augmented_drugs = []
for target, group in target_groups:
    print(f"\nAugmenting {target} inhibitors ({len(group)} compounds)...")

    # Adjust augmentation based on data availability:
    # fewer than 3 compounds -> aggressive (double the data, all four methods);
    # otherwise -> conservative (50% more data, fusion and enumeration only).
    is_small_group = len(group) < 3
    aug_percentage = 1.0 if is_small_group else 0.5
    methods = (
        ["mask", "delete", "fusion", "enumeration"]
        if is_small_group
        else ["fusion", "enumeration"]
    )

    group_settings = {
        "col_to_augment": "SMILES",
        "dataset": group,
        "augmentation_methods": methods,
        "mask_ratio": 0.15,
        "delete_ratio": 0.20,
        "augment_percentage": aug_percentage,
        "property_col": "IC50",
        "seed": 42,
    }
    aug_group = augment_dataset(**group_settings)
    augmented_drugs.append(aug_group)
    print(f" {target}: {len(group)} -> {len(aug_group)} compounds")

# Combine all augmented drug data under a fresh 0..n-1 index.
final_drug_dataset = pd.concat(augmented_drugs, ignore_index=True)
print(f"\nFinal augmented drug dataset: {len(final_drug_dataset)} compounds")
print(f"Original: {len(drug_df)} -> Augmented: {len(final_drug_dataset)}")
print(f"Expansion factor: {len(final_drug_dataset)/len(drug_df):.1f}x")

# Save for ML training
final_drug_dataset.to_csv('augmented_drug_dataset.csv', index=False)
These examples demonstrate the versatility and practical applications of AugChem's SMILES augmentation capabilities for various molecular research scenarios, from basic data expansion to sophisticated drug discovery workflows.