Team CosmoStat
Example of reconstructing the initial conditions of the Universe by gradient descent
import tensorflow as tf
import numpy as np
import flowpm

cosmo = flowpm.cosmology.Planck15()
stages = np.linspace(0.1, 1.0, 10, endpoint=True)

# ipklin: interpolation function for the linear matter power spectrum (assumed precomputed)
initial_conditions = flowpm.linear_field(32,          # Size of the cube
                                         100,         # Physical size of the cube
                                         ipklin,      # Initial power spectrum
                                         batch_size=16)

# Sample particles
state = flowpm.lpt_init(cosmo, initial_conditions, a0=0.1)

# Evolve particles down to z=0
final_state = flowpm.nbody(cosmo, state, stages, 32)

# Retrieve final density field
final_field = flowpm.cic_paint(tf.zeros_like(initial_conditions), final_state[0])
FlowPM on a single GPU
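Because this forward model is differentiable end-to-end, the initial conditions of the Universe can be reconstructed by gradient descent on a data-fidelity loss. A minimal sketch, assuming an observed density field observed_field and the same cosmo, ipklin and stages as above (the variable names and the plain mean-squared-error loss are illustrative, not the full reconstruction pipeline):

# Free parameter: the linear initial conditions we want to recover
lin_field = tf.Variable(flowpm.linear_field(32, 100, ipklin, batch_size=16))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

for step in range(200):
    with tf.GradientTape() as tape:
        # Run the forward N-body model from the current guess
        state = flowpm.lpt_init(cosmo, lin_field, a0=0.1)
        final_state = flowpm.nbody(cosmo, state, stages, 32)
        model_field = flowpm.cic_paint(tf.zeros_like(lin_field), final_state[0])
        # Data-fidelity loss against the observed field
        loss = tf.reduce_mean((model_field - observed_field)**2)
    # Gradients flow through the whole simulation back to the initial field
    grads = tape.gradient(loss, [lin_field])
    optimizer.apply_gradients(zip(grads, [lin_field]))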
# A simple two-layer fully connected network
# y = ReLU(x w + bias) v

# Define named dimensions
batch = mtf.Dimension("batch", b)
io = mtf.Dimension("io", d_io)
hidden = mtf.Dimension("hidden", d_h)
# x.shape == [batch, io]

# Get variables
w = mtf.get_variable("w", shape=[io, hidden])
bias = mtf.get_variable("bias", shape=[hidden])
v = mtf.get_variable("v", shape=[hidden, io])

# Define operations
h = mtf.relu(mtf.einsum(x, w, output_shape=[batch, hidden]) + bias)
y = mtf.einsum(h, v, output_shape=[batch, io])
# Define mesh dimensions and layout
mesh_shape = [("processor_rows", 2), ("processor_cols", 2)]
layout_rules = [("batch", "processor_rows"),
                ("hidden", "processor_cols")]
Comparison of distributed simulation on 4 GPUs
But... we are hitting performance and scalability issues
gpus = tf.config.experimental.list_logical_devices('GPU')
if gpus:
    # Replicate your computation on multiple GPUs
    c = []
    for gpu in gpus:
        with tf.device(gpu.name):
            a = tf.constant([[1.0, 2.0, 3.0],
                             [4.0, 5.0, 6.0]])
            b = tf.constant([[1.0, 2.0],
                             [3.0, 4.0],
                             [5.0, 6.0]])
            c.append(tf.matmul(a, b))

    with tf.device('/CPU:0'):
        matmul_sum = tf.add_n(c)

    print(matmul_sum)
# Resolve the cluster from the SLURM environment
cluster = tf.distribute.cluster_resolver.SlurmClusterResolver(
    {"mesh": mesh_shape.size // FLAGS.gpus_per_task},
    port_base=8822,
    gpus_per_node=FLAGS.gpus_per_node,
    gpus_per_task=FLAGS.gpus_per_task,
    tasks_per_node=FLAGS.tasks_per_node)
cluster_spec = cluster.cluster_spec()

# Create a server for all mesh members
server = tf.distribute.Server(cluster_spec, "mesh",
                              cluster.task_id)

# Only the master job takes care of the graph building,
# everyone else can just chill for now
if cluster.task_id > 0:
    server.join()

# Otherwise we are the main task, let's define the devices
mesh_devices = [
    "/job:mesh/task:%d/device:GPU:%d" % (i, j)
    for i in range(cluster_spec.num_tasks("mesh"))
    for j in range(FLAGS.gpus_per_task)
]
print("List of devices", mesh_devices)

mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
    mesh_shape, layout_rules, mesh_devices)
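From here, the master task lowers its Mesh TensorFlow graph onto this distributed mesh and runs it through a session connected to the in-process server, so the lowered ops land on the /job:mesh GPUs. A minimal sketch, assuming TF1-style graph execution and a Mesh TF output tensor y defined on mesh as above (names are illustrative):

# Lower the Mesh TF graph into regular TensorFlow ops on the mesh devices
lowering = mtf.Lowering(graph, {mesh: mesh_impl})
tf_y = lowering.export_to_tf_tensor(y)

# Run against the in-process server so the computation is distributed
with tf.compat.v1.Session(server.target) as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    result = sess.run(tf_y)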
from mpi4py import MPI
import horovod.tensorflow as hvd

# This will be our baseline world communicator
comm = MPI.COMM_WORLD

# Split COMM_WORLD into subcommunicators
subcomm = MPI.COMM_WORLD.Split(color=MPI.COMM_WORLD.rank % 2,
                               key=MPI.COMM_WORLD.rank)

# Initialize horovod with an array of communicators
hvd.init(comm=[comm, subcomm])

[....]

# AlltoAll over the WORLD communicator
hvd.alltoall(a, communicator_id=0)

# AlltoAll over the second (split) communicator
hvd.alltoall(a, communicator_id=1)
mesh_impl = HvdSimdMeshImpl(mtf.convert_to_shape(mesh_shape),
                            mtf.convert_to_layout_rules(layout_rules))

# And internally, HvdSimdMeshImpl essentially does:
# Use MPI to define the needed communicators
cart_comm = MPI.COMM_WORLD.Create_cart(dims=dims,
                                       periods=[False]*len(dims))
[...]
# Initialize horovod with our set of communicators
hvd.init(comm=[c for _, c in comms.items()])
# When collectives are called:
t = hvd.alltoall(t, communicator_id=self._comms_id[name_dim])
GPU utilisation for a 3D FFT with the device-placement Mesh TF implementation