Computing SOAP features¶

This examples shows how to compute the SOAP power spectrum descriptor for each atom in each system of a provided systems file. The path to the systems file is taken from the first command line argument.

In the end, the descriptor is transformed in a way compatible with how most classic machine learning (such as PCA or linear regression) work.

The workflow is the same for every provided descriptor. Take a look at the Reference guides for a list with all descriptors and their specific parameters.

You can obtain a testing dataset from our website.

import chemfiles

from featomic import SoapPowerSpectrum

Read systems using chemfiles. You can obtain the dataset used in this example from our website.

with chemfiles.Trajectory("dataset.xyz") as trajectory:
    systems = [s for s in trajectory]

Featomic can also handles systems read by ASE using

systems = ase.io.read("dataset.xyz", ":").

We can now define hyper parameters for the calculation

HYPER_PARAMETERS = {
    "cutoff": {
        "radius": 5.0,
        "smoothing": {"type": "ShiftedCosine", "width": 0.5},
    },
    "density": {
        "type": "Gaussian",
        "width": 0.3,
    },
    "basis": {
        "type": "TensorProduct",
        "max_angular": 4,
        "radial": {"type": "Gto", "max_radial": 6},
    },
}

calculator = SoapPowerSpectrum(**HYPER_PARAMETERS)

And then run the actual calculation, including gradients with respect to positions

descriptor = calculator.compute(systems, gradients=["positions"])

The descriptor is a metatensor TensorMap, containing multiple blocks. We can transform it to a single block containing a dense representation, with one sample for each atom-centered environment by using keys_to_samples and keys_to_properties

print("before: ", len(descriptor.keys))

descriptor = descriptor.keys_to_samples("center_type")
descriptor = descriptor.keys_to_properties(["neighbor_1_type", "neighbor_2_type"])
print("after: ", len(descriptor.keys))

before:  40
after:  1

you can now use descriptor.block().values as the input of a machine learning algorithm

print(descriptor.block().values.shape)

(1380, 2450)

use metatensor::Labels;
use featomic::{Calculator, System, CalculationOptions};
use chemfiles::{Trajectory, Frame};

fn read_systems_from_file(path: &str) -> Vec<Box<dyn System>> {
    let mut trajectory = Trajectory::open(path, 'r').expect("could not open the trajectory");
    let mut frame = Frame::new();
    let mut systems = Vec::new();
    for step in 0..trajectory.nsteps() {
        trajectory.read_step(step, &mut frame).expect("failed to read single frame");
        systems.push((&frame).into());
    }

    systems
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // load the systems from command line argument
    let path = std::env::args().nth(1).expect("expected a command line argument");
    let mut systems = read_systems_from_file(&path);

    // pass hyper-parameters as JSON
    let parameters = r#"{
        "cutoff": {
            "radius": 5.0,
            "smoothing": {
                "type": "ShiftedCosine",
                "width": 0.5
            }
        },
        "density": {
            "type": "Gaussian",
            "width": 0.3
        },
        "basis": {
            "type": "TensorProduct",
            "max_angular": 4,
            "radial": {"type": "Gto", "max_radial": 6}
        }
    }"#;
    // create the calculator with its name and parameters
    let mut calculator = Calculator::new("soap_power_spectrum", parameters.to_owned())?;

    // create the options for a single calculation, here we request the
    // calculation of gradients with respect to positions
    let options = CalculationOptions {
        gradients: &["positions"],
        ..Default::default()
    };

    // run the calculation
    let descriptor = calculator.compute(&mut systems, options)?;

    // Transform the descriptor to dense representation, with one sample for
    // each atom-centered environment, and the neighbor atomic types part of the
    // properties
    let keys_to_move = Labels::empty(vec!["center_type"]);
    let descriptor = descriptor.keys_to_samples(&keys_to_move, /* sort_samples */ true)?;

    let keys_to_move = Labels::empty(vec!["neighbor_1_type", "neighbor_2_type"]);
    let descriptor = descriptor.keys_to_properties(&keys_to_move, /* sort_samples */ true)?;

    // descriptor now contains a single block, which can be used as the input
    // to standard ML algorithms
    let values = descriptor.block_by_id(0).values().to_array();
    println!("SOAP representation shape: {:?}", values.shape());

    Ok(())
}

#include <iostream>
#include <chemfiles.hpp>
#include <featomic.hpp>

static std::vector<featomic::SimpleSystem> read_systems(const std::string& path);

int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cout << "error: expected a command line argument" << std::endl;
        return 1;
    }
    auto systems = read_systems(argv[1]);

    // pass hyper-parameters as JSON
    const char* parameters = R"({
        "cutoff": {
            "radius": 5.0,
            "smoothing": {"type": "ShiftedCosine", "width": 0.5}
        },
        "density": {
            "type": "Gaussian",
            "width": 0.3
        },
        "basis": {
            "type": "TensorProduct",
            "max_angular": 6,
            "radial": {"type": "Gto", "max_radial": 6}
        }
    })";

    // create the calculator with its name and parameters
    auto calculator = featomic::Calculator("soap_power_spectrum", parameters);

    // setup the calculation options
    auto options = featomic::CalculationOptions();
    options.use_native_system = true;
    options.gradients.push_back("positions");

    // run the calculation
    auto descriptor = calculator.compute(systems, options);

    // The descriptor is a metatensor `TensorMap`, containing multiple blocks.
    // We can transform it to a single block containing a dense representation,
    // with one sample for each atom-centered environment.
    descriptor.keys_to_samples("center_type");
    descriptor.keys_to_properties(std::vector<std::string>{"neighbor_1_type", "neighbor_2_type"});

    // extract values from the descriptor in the only remaining block
    auto block = descriptor.block_by_id(0);
    auto values = block.values();

    // you can now use values as the input of a machine learning algorithm

    return 0;
}


std::vector<featomic::SimpleSystem> read_systems(const std::string& path) {
    auto trajectory = chemfiles::Trajectory(path);
    auto systems = std::vector<featomic::SimpleSystem>();
    for (size_t step=0; step<trajectory.size(); step++) {
        auto frame = trajectory.read_at(step);

        auto matrix = frame.cell().matrix();
        auto system = featomic::SimpleSystem(featomic::System::CellMatrix{{
            {matrix[0][0], matrix[0][1], matrix[0][2]},
            {matrix[1][0], matrix[1][1], matrix[1][2]},
            {matrix[2][0], matrix[2][1], matrix[2][2]},
        }});

        auto positions = frame.positions();
        for (size_t i=0; i<frame.size(); i++) {
            system.add_atom(
                static_cast<int32_t>(frame[i].atomic_number().value_or(0)),
                {positions[i][0], positions[i][1], positions[i][2]}
            );
        }

        systems.push_back(system);
    }

    return systems;
}

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#include <featomic.h>
#include <metatensor.h>

#include "common/systems.h"

static mts_tensormap_t* move_keys_to_samples(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len);
static mts_tensormap_t* move_keys_to_properties(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len);

int main(int argc, char* argv[]) {
    int status = FEATOMIC_SUCCESS;
    featomic_calculator_t* calculator = NULL;
    featomic_system_t* systems = NULL;
    uintptr_t n_systems = 0;
    double* values = NULL;
    const uintptr_t* shape = NULL;
    uintptr_t shape_count = 0;
    bool got_error = true;
    const char* keys_to_samples[] = {"center_type"};
    const char* keys_to_properties[] = {"neighbor_1_type", "neighbor_2_type"};
    // use the default set of options, computing all samples and all features,
    // and including gradients with respect to positions
    featomic_calculation_options_t options = {0};
    const char* gradients_list[] = {"positions"};
    options.gradients = gradients_list;
    options.gradients_count = 1;
    options.use_native_system = true;

    mts_tensormap_t* descriptor = NULL;
    mts_block_t* block = NULL;
    mts_array_t array = {0};

    // hyper-parameters for the calculation as JSON
    const char* parameters = "{\n"
        "\"cutoff\": {\n"
        "    \"radius\": 5.0,\n"
        "    \"smoothing\": {\"type\": \"ShiftedCosine\", \"width\": 0.5}\n"
        "},\n"
        "\"density\": {\n"
        "    \"type\": \"Gaussian\",\n"
        "    \"width\": 0.3\n"
        "},\n"
        "\"basis\": {\n"
        "    \"type\": \"TensorProduct\",\n"
        "    \"max_angular\": 6,\n"
        "    \"radial\": {\"type\": \"Gto\", \"max_radial\": 6}\n"
        "}\n"
    "}";

    // load systems from command line arguments
    if (argc < 2) {
        printf("error: expected a command line argument");
        goto cleanup;
    }
    status = read_systems_example(argv[1], &systems, &n_systems);
    if (status != 0) {
        goto cleanup;
    }

    // create the calculator with its name and parameters
    calculator = featomic_calculator("soap_power_spectrum", parameters);
    if (calculator == NULL) {
        printf("Error: %s\n", featomic_last_error());
        goto cleanup;
    }

    // run the calculation
    status = featomic_calculator_compute(
        calculator, &descriptor, systems, n_systems, options
    );
    if (status != FEATOMIC_SUCCESS) {
        printf("Error: %s\n", featomic_last_error());
        goto cleanup;
    }

    // The descriptor is a metatensor `TensorMap`, containing multiple blocks.
    // We can transform it to a single block containing a dense representation,
    // with one sample for each atom-centered environment.
    descriptor = move_keys_to_samples(descriptor, keys_to_samples, 1);
    if (descriptor == NULL) {
        printf("Error: %s\n", mts_last_error());
        goto cleanup;
    }

    descriptor = move_keys_to_properties(descriptor, keys_to_properties, 2);
    if (descriptor == NULL) {
        printf("Error: %s\n", mts_last_error());
        goto cleanup;
    }

    // extract the unique block and corresponding values from the descriptor
    status = mts_tensormap_block_by_id(descriptor, &block, 0);
    if (status != MTS_SUCCESS) {
        printf("Error: %s\n", mts_last_error());
        goto cleanup;
    }

    status = mts_block_data(block, &array);
    if (status != MTS_SUCCESS) {
        printf("Error: %s\n", mts_last_error());
        goto cleanup;
    }

    // callback the functions on the mts_array_t to extract the shape/data pointer
    status = array.shape(array.ptr, &shape, &shape_count);
    if (status != MTS_SUCCESS) {
        printf("Error: %s\n", mts_last_error());
        goto cleanup;
    }

    status = array.data(array.ptr, &values);
    if (status != MTS_SUCCESS) {
        printf("Error: %s\n", mts_last_error());
        goto cleanup;
    }

    if (status != MTS_SUCCESS) {
        printf("Error: %s\n", mts_last_error());
        goto cleanup;
    }
    assert(shape_count == 2);

    // you can now use `values` as the input of a machine learning algorithm
    printf("the value array shape is %lu x %lu\n", shape[0], shape[1]);

    got_error = false;
cleanup:
    mts_tensormap_free(descriptor);
    featomic_calculator_free(calculator);

    free_systems_example(systems, n_systems);

    if (got_error) {
        return 1;
    } else {
        return 0;
    }
}


mts_tensormap_t* move_keys_to_samples(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len) {
    mts_labels_t keys = {0};
    mts_tensormap_t* moved_descriptor = NULL;

    keys.names = keys_to_move;
    keys.size = keys_to_move_len;
    keys.values = NULL;
    keys.count = 0;

    moved_descriptor = mts_tensormap_keys_to_samples(descriptor, keys, true);
    mts_tensormap_free(descriptor);

    return moved_descriptor;
}


mts_tensormap_t* move_keys_to_properties(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len) {
    mts_labels_t keys = {0};
    mts_tensormap_t* moved_descriptor = NULL;

    keys.names = keys_to_move;
    keys.size = keys_to_move_len;
    keys.values = NULL;
    keys.count = 0;

    moved_descriptor = mts_tensormap_keys_to_properties(descriptor, keys, true);
    mts_tensormap_free(descriptor);

    return moved_descriptor;
}

#include "common/systems.c"