Skip to content

Syntax Cheatsheet

OpenMP

Critical section

1
#pragma omp critical(name)
2
{
3
// code block
4
}

Barrier

Explicit:

1
#pragma omp barrier

Implicit: at the end of parallel, for (loop), sections, and single regions (unless nowait is specified)

Atomic

1
#pragma omp atomic
2
counter++; // applies only to the single statement that follows — a { } block is NOT allowed

atomic read: Read the value of a shared variable atomically

1
int x;
2
#pragma omp atomic read
3
x = sharedVar; // Read the value of sharedVar atomically

atomic write: Update the value of a shared variable atomically

1
#pragma omp atomic write
2
sharedVar = newValue; // Update sharedVar atomically

atomic update: Perform arithmetic or logical operations on a shared variable atomically

1
#pragma omp atomic update
2
sharedVar += 2; // Atomically increment sharedVar by 2

atomic capture: Read the current value of a shared variable and then update it with a new value atomically.

Often used for cases where you need to retrieve and update a shared variable together.

1
int oldValue;
2
#pragma omp atomic capture
3
{
4
oldValue = sharedVar; // Read the current value
5
sharedVar = newValue; // Update with a new value
6
}

Reduction

Perform a reduction operation on a shared variable across multiple threads

Supports +, -, *, min, max, &, |, ^, &&, ||

1
#pragma omp parallel for reduction(+:sum)
2
for (int i = 0; i < n; ++i) {
3
sum += data[i];
4
}

Parallel Region

Run stuff in parallel

1
omp_set_num_threads(4);
2
#pragma omp parallel
3
{
4
// code block
5
}

Parallel Sections

1
#pragma omp parallel sections
2
{
3
#pragma omp section
4
{
5
printf ("id = %d, \n", omp_get_thread_num());
6
}
7
8
#pragma omp section
9
{
10
printf ("id = %d, \n", omp_get_thread_num());
11
}
12
}

Parallel For

Parallelize for loops

1
#pragma omp parallel
2
{
3
int id = omp_get_thread_num();
4
cout << id << endl;
5
#pragma omp for
6
for (int i = 0; i < 100; i++) {
7
// do stuff
8
}
9
}
10
11
// Or
12
#pragma omp parallel for
13
for (int i = 0; i < 100; i++) {
14
// do stuff
15
}

The collapse clause is used to convert a nested loop into a single loop then parallelize it.

1
#pragma omp parallel for collapse(2)
2
for (int i = 0; i < 4; i++)
3
{
4
for (int j = 0; j < 5; j++)
5
{
6
// do stuff
7
}
8
}

Master

Code is run only by master thread (ID = 0)

1
#pragma omp master
2
{
3
// code block
4
}

Single

Code is run by only 1 thread (not necessarily master)

1
#pragma omp parallel
2
{
3
// Code executed by all threads
4
5
#pragma omp single [private|firstprivate|copyprivate|nowait]
6
{
7
// Code executed by a single thread
8
}
9
10
// More code executed by all threads
11
}

private: Specify variables that should have private instances for each thread executing the omp single region.

1
int sharedVar = 0;
2
3
#pragma omp parallel
4
{
5
int privateVar; // Private to each thread
6
7
#pragma omp single private(privateVar)
8
{
9
privateVar = 42; // Initialize privateVar
10
}
11
12
// privateVar is now thread-local
13
}
  • Inside the omp single block, privateVar is initialized to 42, but this change does not affect the outer scope or other threads.
  • After the omp single block, the private copy (and its value 42) is discarded; the other threads' privateVar is unaffected and remains uninitialized.

firstprivate: Similar to private but initializes the private copies of variables with values from the outer scope.

1
int initialValue = 10;
2
3
#pragma omp parallel
4
{
5
int privateVar; // Private to each thread
6
7
#pragma omp single firstprivate(initialValue, privateVar)
8
{
9
privateVar = initialValue; // Initialize privateVar with initialValue
10
}
11
12
// privateVar is now thread-local with the initial value of 10
13
}
  • The firstprivate clause initializes the private copies of privateVar with the value of initialValue, which is 10 in this case.
  • After the omp single block, the private copies are discarded; only the one thread that executed the region performed the assignment, and that assignment does not persist for the other threads.

copyprivate: Specify variables that should be copied from the single thread’s context to the context of all other threads after the omp single block.

1
int sharedResult;
2
3
#pragma omp parallel
4
{
5
int privateResult; // Private to each thread
6
7
#pragma omp single copyprivate(sharedResult, privateResult)
8
{
9
privateResult = 42; // Compute a result
10
sharedResult = privateResult; // Share the result with other threads
11
}
12
13
// sharedResult now has the same value for all threads
14
}

nowait: Indicate that other threads should not wait at a barrier after the omp single block.

CUDA

Kernel Launch

Kernel_name<<< gridsize, blocksize >>>(arg1, arg2, …); // kernel must be declared __global__ void; the launch itself is a statement

Memory Management

cudaError_t cudaMalloc( void **devPtr, size_t size );

  • Example: cudaMalloc( (void **) &d_c, numbytes );

cudaError_t cudaFree( void *devPtr );

  • Example: cudaFree( d_c );

cudaError_t cudaMemcpy( void *dst, const void *src, size_t size, enum cudaMemcpyKind kind );

  • enum cudaMemcpyKind
    • cudaMemcpyHostToDevice
    • cudaMemcpyDeviceToHost
    • cudaMemcpyDeviceToDevice
  • Example: cudaMemcpy( d_c, c, numbytes, cudaMemcpyHostToDevice);

Error Checking

  • cudaError_t cudaGetLastError(void);
  • const char *cudaGetErrorString( cudaError_t code );
  • printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );

MPI

Set up and tear down

1
// Starts up the MPI runtime environment at the beginning of a run.
2
MPI_Init(&argc, &argv);
3
4
// Shuts down the MPI runtime environment at the end of a run.
5
MPI_Finalize();

Blocking Point-to-Point

1
// Send a message to one process.
2
int MPI_Send (void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
3
4
// Receive a message from one process.
5
int MPI_Recv (void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)
6
7
// Count received data elements.
8
int MPI_Get_count (MPI_Status *status, MPI_Datatype datatype, int *count)
9
10
// Wait for message arrival.
11
int MPI_Probe (int source, int tag, MPI_Comm comm, MPI_Status *status)

Related Functions: MPI_Bsend, MPI_Ssend, MPI_Rsend, MPI_Buffer_attach, MPI_Buffer_detach, MPI_Sendrecv, MPI_Sendrecv_replace, MPI_Get_elements

Non-blocking Point-to-Point

1
// Begin to receive a message.
2
int MPI_Irecv (void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request)
3
4
// Complete a non-blocking operation.
5
int MPI_Wait (MPI_Request *request, MPI_Status *status)
6
7
// Check or complete a non-blocking operation.
8
int MPI_Test (MPI_Request *request, int *flag, MPI_Status *status)
9
10
//Check message arrival.
11
int MPI_Iprobe (int source, int tag, MPI_Comm comm, int *flag, MPI_Status *status)

Derived Datatypes

1
// Create a strided homogeneous vector.
2
int MPI_Type_vector (int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype *newtype)
3
4
// Save a derived datatype
5
int MPI_Type_commit (MPI_Datatype *datatype)

Collective

1
// Send one message to all group members.
2
int MPI_Bcast (void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
3
4
// Receive from all group members.
5
int MPI_Gather (void *sendbuf, int sendcount, MPI_Datatype sendtype,
6
void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
7
8
// Send separate messages to all group members. (§4.6)
9
int MPI_Scatter (void *sendbuf, int sendcount, MPI_Datatype sendtype,
10
void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
11
12
// Combine messages from all group members. (§4.9.1)
13
int MPI_Reduce (void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
14
MPI_Op op, int root, MPI_Comm comm)
15
16
// performs a reduction of a variable on all processes, and sends result to all processes (and therefore takes longer)
17
MPI_Allreduce(&value, &value_sum, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

Communicators

1
// Count group members in communicator.
2
int MPI_Comm_size (MPI_Comm comm, int *size)
3
4
// Determine group rank of self. (§5.4.1)
5
int MPI_Comm_rank (MPI_Comm comm, int *rank)
6
7
// Duplicate with new context. (§5.4.2)
8
int MPI_Comm_dup (MPI_Comm comm, MPI_Comm *newcomm)
9
10
// Split into categorized sub-groups. (§5.4.2)
11
int MPI_Comm_split (MPI_Comm comm, int color, int key, MPI_Comm *newcomm)

Communicators with Topology

1
// Create with cartesian topology. (§6.5.1)
2
int MPI_Cart_create (MPI_Comm comm_old, int ndims, int *dims, int *periods,
3
int reorder, MPI_Comm *comm_cart)
4
5
// Suggest balanced dimension ranges. (§6.5.2)
6
int MPI_Dims_create (int nnodes, int ndims, int *dims)
7
8
// Determine rank from cartesian coordinates. (§6.5.4)
9
int MPI_Cart_rank (MPI_Comm comm, int *coords, int *rank)
10
11
// Determine cartesian coordinates from rank. (§6.5.4)
12
int MPI_Cart_coords (MPI_Comm comm, int rank, int maxdims, int *coords)
13
14
// Determine ranks for cartesian shift. (§6.5.5)
15
int MPI_Cart_shift (MPI_Comm comm, int direction, int disp, int *rank_source, int *rank_dest)
16
17
// Split into lower dimensional sub-grids. (§6.5.6)
18
int MPI_Cart_sub (MPI_Comm comm, int *remain_dims, MPI_Comm *newcomm)

Constants

  • Wildcards: MPI_ANY_TAG, MPI_ANY_SOURCE
  • Elementary Datatypes: MPI_CHAR, MPI_SHORT, MPI_INT, MPI_LONG, MPI_UNSIGNED_CHAR, MPI_UNSIGNED_SHORT, MPI_UNSIGNED, MPI_UNSIGNED_LONG, MPI_FLOAT, MPI_DOUBLE, MPI_LONG_DOUBLE, MPI_BYTE, MPI_PACKED
  • Reserved Communicators: MPI_COMM_WORLD, MPI_COMM_SELF
  • Reduction Operations: MPI_MAX, MPI_MIN, MPI_SUM, MPI_PROD, MPI_BAND, MPI_BOR, MPI_BXOR, MPI_LAND, MPI_LOR, MPI_LXOR