Computing SOAP featuresΒΆ
This examples shows how to compute the SOAP power spectrum descriptor for each atom in each system of a provided systems file. The path to the systems file is taken from the first command line argument.
In the end, the descriptor is transformed in a way compatible with how most classic machine learning (such as PCA or linear regression) work.
The workflow is the same for every provided descriptor. Take a look at the Reference guides for a list with all descriptors and their specific parameters.
You can obtain a testing dataset from our website
.
import chemfiles
from featomic import SoapPowerSpectrum
Read systems using chemfiles. You can obtain the dataset used in this
example from our website
.
with chemfiles.Trajectory("dataset.xyz") as trajectory:
systems = [s for s in trajectory]
Featomic can also handles systems read by ASE using
systems = ase.io.read("dataset.xyz", ":")
.
We can now define hyper parameters for the calculation
HYPER_PARAMETERS = {
"cutoff": {
"radius": 5.0,
"smoothing": {"type": "ShiftedCosine", "width": 0.5},
},
"density": {
"type": "Gaussian",
"width": 0.3,
},
"basis": {
"type": "TensorProduct",
"max_angular": 4,
"radial": {"type": "Gto", "max_radial": 6},
},
}
calculator = SoapPowerSpectrum(**HYPER_PARAMETERS)
And then run the actual calculation, including gradients with respect to positions
descriptor = calculator.compute(systems, gradients=["positions"])
The descriptor is a metatensor TensorMap
, containing multiple blocks. We
can transform it to a single block containing a dense representation, with one
sample for each atom-centered environment by using keys_to_samples
and
keys_to_properties
print("before: ", len(descriptor.keys))
descriptor = descriptor.keys_to_samples("center_type")
descriptor = descriptor.keys_to_properties(["neighbor_1_type", "neighbor_2_type"])
print("after: ", len(descriptor.keys))
before: 40
after: 1
you can now use descriptor.block().values
as the input of a machine
learning algorithm
print(descriptor.block().values.shape)
(1380, 2450)
use metatensor::Labels;
use featomic::{Calculator, System, CalculationOptions};
use chemfiles::{Trajectory, Frame};
fn read_systems_from_file(path: &str) -> Vec<Box<dyn System>> {
let mut trajectory = Trajectory::open(path, 'r').expect("could not open the trajectory");
let mut frame = Frame::new();
let mut systems = Vec::new();
for step in 0..trajectory.nsteps() {
trajectory.read_step(step, &mut frame).expect("failed to read single frame");
systems.push((&frame).into());
}
systems
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
// load the systems from command line argument
let path = std::env::args().nth(1).expect("expected a command line argument");
let mut systems = read_systems_from_file(&path);
// pass hyper-parameters as JSON
let parameters = r#"{
"cutoff": {
"radius": 5.0,
"smoothing": {
"type": "ShiftedCosine",
"width": 0.5
}
},
"density": {
"type": "Gaussian",
"width": 0.3
},
"basis": {
"type": "TensorProduct",
"max_angular": 4,
"radial": {"type": "Gto", "max_radial": 6}
}
}"#;
// create the calculator with its name and parameters
let mut calculator = Calculator::new("soap_power_spectrum", parameters.to_owned())?;
// create the options for a single calculation, here we request the
// calculation of gradients with respect to positions
let options = CalculationOptions {
gradients: &["positions"],
..Default::default()
};
// run the calculation
let descriptor = calculator.compute(&mut systems, options)?;
// Transform the descriptor to dense representation, with one sample for
// each atom-centered environment, and the neighbor atomic types part of the
// properties
let keys_to_move = Labels::empty(vec!["center_type"]);
let descriptor = descriptor.keys_to_samples(&keys_to_move, /* sort_samples */ true)?;
let keys_to_move = Labels::empty(vec!["neighbor_1_type", "neighbor_2_type"]);
let descriptor = descriptor.keys_to_properties(&keys_to_move, /* sort_samples */ true)?;
// descriptor now contains a single block, which can be used as the input
// to standard ML algorithms
let values = descriptor.block_by_id(0).values().to_array();
println!("SOAP representation shape: {:?}", values.shape());
Ok(())
}
#include <iostream>
#include <chemfiles.hpp>
#include <featomic.hpp>
static std::vector<featomic::SimpleSystem> read_systems(const std::string& path);
int main(int argc, char* argv[]) {
if (argc < 2) {
std::cout << "error: expected a command line argument" << std::endl;
return 1;
}
auto systems = read_systems(argv[1]);
// pass hyper-parameters as JSON
const char* parameters = R"({
"cutoff": {
"radius": 5.0,
"smoothing": {"type": "ShiftedCosine", "width": 0.5}
},
"density": {
"type": "Gaussian",
"width": 0.3
},
"basis": {
"type": "TensorProduct",
"max_angular": 6,
"radial": {"type": "Gto", "max_radial": 6}
}
})";
// create the calculator with its name and parameters
auto calculator = featomic::Calculator("soap_power_spectrum", parameters);
// setup the calculation options
auto options = featomic::CalculationOptions();
options.use_native_system = true;
options.gradients.push_back("positions");
// run the calculation
auto descriptor = calculator.compute(systems, options);
// The descriptor is a metatensor `TensorMap`, containing multiple blocks.
// We can transform it to a single block containing a dense representation,
// with one sample for each atom-centered environment.
descriptor.keys_to_samples("center_type");
descriptor.keys_to_properties(std::vector<std::string>{"neighbor_1_type", "neighbor_2_type"});
// extract values from the descriptor in the only remaining block
auto block = descriptor.block_by_id(0);
auto values = block.values();
// you can now use values as the input of a machine learning algorithm
return 0;
}
std::vector<featomic::SimpleSystem> read_systems(const std::string& path) {
auto trajectory = chemfiles::Trajectory(path);
auto systems = std::vector<featomic::SimpleSystem>();
for (size_t step=0; step<trajectory.nsteps(); step++) {
auto frame = trajectory.read_step(step);
auto matrix = frame.cell().matrix();
auto system = featomic::SimpleSystem(featomic::System::CellMatrix{{
{matrix[0][0], matrix[0][1], matrix[0][2]},
{matrix[1][0], matrix[1][1], matrix[1][2]},
{matrix[2][0], matrix[2][1], matrix[2][2]},
}});
auto positions = frame.positions();
for (size_t i=0; i<frame.size(); i++) {
system.add_atom(
static_cast<int32_t>(frame[i].atomic_number().value_or(0)),
{positions[i][0], positions[i][1], positions[i][2]}
);
}
systems.push_back(system);
}
return systems;
}
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <featomic.h>
#include <metatensor.h>
#include "common/systems.h"
static mts_tensormap_t* move_keys_to_samples(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len);
static mts_tensormap_t* move_keys_to_properties(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len);
int main(int argc, char* argv[]) {
int status = FEATOMIC_SUCCESS;
featomic_calculator_t* calculator = NULL;
featomic_system_t* systems = NULL;
uintptr_t n_systems = 0;
double* values = NULL;
const uintptr_t* shape = NULL;
uintptr_t shape_count = 0;
bool got_error = true;
const char* keys_to_samples[] = {"center_type"};
const char* keys_to_properties[] = {"neighbor_1_type", "neighbor_2_type"};
// use the default set of options, computing all samples and all features,
// and including gradients with respect to positions
featomic_calculation_options_t options = {0};
const char* gradients_list[] = {"positions"};
options.gradients = gradients_list;
options.gradients_count = 1;
options.use_native_system = true;
mts_tensormap_t* descriptor = NULL;
mts_block_t* block = NULL;
mts_array_t array = {0};
// hyper-parameters for the calculation as JSON
const char* parameters = "{\n"
"\"cutoff\": {\n"
" \"radius\": 5.0,\n"
" \"smoothing\": {\"type\": \"ShiftedCosine\", \"width\": 0.5}\n"
"},\n"
"\"density\": {\n"
" \"type\": \"Gaussian\",\n"
" \"width\": 0.3\n"
"},\n"
"\"basis\": {\n"
" \"type\": \"TensorProduct\",\n"
" \"max_angular\": 6,\n"
" \"radial\": {\"type\": \"Gto\", \"max_radial\": 6}\n"
"}\n"
"}";
// load systems from command line arguments
if (argc < 2) {
printf("error: expected a command line argument");
goto cleanup;
}
status = read_systems_example(argv[1], &systems, &n_systems);
if (status != 0) {
goto cleanup;
}
// create the calculator with its name and parameters
calculator = featomic_calculator("soap_power_spectrum", parameters);
if (calculator == NULL) {
printf("Error: %s\n", featomic_last_error());
goto cleanup;
}
// run the calculation
status = featomic_calculator_compute(
calculator, &descriptor, systems, n_systems, options
);
if (status != FEATOMIC_SUCCESS) {
printf("Error: %s\n", featomic_last_error());
goto cleanup;
}
// The descriptor is a metatensor `TensorMap`, containing multiple blocks.
// We can transform it to a single block containing a dense representation,
// with one sample for each atom-centered environment.
descriptor = move_keys_to_samples(descriptor, keys_to_samples, 1);
if (descriptor == NULL) {
printf("Error: %s\n", mts_last_error());
goto cleanup;
}
descriptor = move_keys_to_properties(descriptor, keys_to_properties, 2);
if (descriptor == NULL) {
printf("Error: %s\n", mts_last_error());
goto cleanup;
}
// extract the unique block and corresponding values from the descriptor
status = mts_tensormap_block_by_id(descriptor, &block, 0);
if (status != MTS_SUCCESS) {
printf("Error: %s\n", mts_last_error());
goto cleanup;
}
status = mts_block_data(block, &array);
if (status != MTS_SUCCESS) {
printf("Error: %s\n", mts_last_error());
goto cleanup;
}
// callback the functions on the mts_array_t to extract the shape/data pointer
status = array.shape(array.ptr, &shape, &shape_count);
if (status != MTS_SUCCESS) {
printf("Error: %s\n", mts_last_error());
goto cleanup;
}
status = array.data(array.ptr, &values);
if (status != MTS_SUCCESS) {
printf("Error: %s\n", mts_last_error());
goto cleanup;
}
if (status != MTS_SUCCESS) {
printf("Error: %s\n", mts_last_error());
goto cleanup;
}
assert(shape_count == 2);
// you can now use `values` as the input of a machine learning algorithm
printf("the value array shape is %lu x %lu\n", shape[0], shape[1]);
got_error = false;
cleanup:
mts_tensormap_free(descriptor);
featomic_calculator_free(calculator);
free_systems_example(systems, n_systems);
if (got_error) {
return 1;
} else {
return 0;
}
}
mts_tensormap_t* move_keys_to_samples(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len) {
mts_labels_t keys = {0};
mts_tensormap_t* moved_descriptor = NULL;
keys.names = keys_to_move;
keys.size = keys_to_move_len;
keys.values = NULL;
keys.count = 0;
moved_descriptor = mts_tensormap_keys_to_samples(descriptor, keys, true);
mts_tensormap_free(descriptor);
return moved_descriptor;
}
mts_tensormap_t* move_keys_to_properties(mts_tensormap_t* descriptor, const char* keys_to_move[], size_t keys_to_move_len) {
mts_labels_t keys = {0};
mts_tensormap_t* moved_descriptor = NULL;
keys.names = keys_to_move;
keys.size = keys_to_move_len;
keys.values = NULL;
keys.count = 0;
moved_descriptor = mts_tensormap_keys_to_properties(descriptor, keys, true);
mts_tensormap_free(descriptor);
return moved_descriptor;
}
#include "common/systems.c"