Skip to content

Syntax Cheatsheet

OpenMP

Critical section

1
#pragma omp critical(name)
2
{
3
// code block
4
}

Barrier

Explicit:

1
#pragma omp barrier

Implicit: at the end of parallel, for (loop), sections, and single regions (unless nowait is specified)

Atomic

1
#pragma omp atomic
2
counter++; // applies only to the single statement that follows — a { } block is NOT allowed

atomic read: Read the value of a shared variable atomically

1
int x;
2
#pragma omp atomic read
3
x = sharedVar; // Read the value of sharedVar atomically

atomic write: Update the value of a shared variable atomically

1
#pragma omp atomic write
2
sharedVar = newValue; // Update sharedVar atomically

atomic update: Perform arithmetic or logical operations on a shared variable atomically

1
#pragma omp atomic update
2
sharedVar += 2; // Atomically increment sharedVar by 2

atomic capture: Read the current value of a shared variable and then update it with a new value atomically.

Often used for cases where you need to retrieve and update a shared variable together.

1
int oldValue;
2
#pragma omp atomic capture
3
{
4
oldValue = sharedVar; // Read the current value
5
sharedVar = newValue; // Update with a new value
6
}

Reduction

Perform a reduction operation on a shared variable across multiple threads

Supports +, -, *, min, max, &, |, ^, &&, ||

1
#pragma omp parallel for reduction(+:sum)
2
for (int i = 0; i < n; ++i) {
3
sum += data[i];
4
}

Parallel Region

Run stuff in parallel

1
omp_set_num_threads(4);
2
#pragma omp parallel
3
{
4
// code block
5
}

Parallel Sections

1
#pragma omp parallel sections
2
{
3
#pragma omp section
4
{
5
printf ("id = %d, \n", omp_get_thread_num());
6
}
7
8
#pragma omp section
9
{
10
printf ("id = %d, \n", omp_get_thread_num());
11
}
12
}

Parallel For

Parallelize for loops

1
#pragma omp parallel
2
{
3
int id = omp_get_thread_num();
4
cout << id << endl;
5
#pragma omp for
6
for (int i = 0; i < 100; i++) {
7
// do stuff
8
}
9
}
10
11
// Or
12
#pragma omp parallel for
13
for (int i = 0; i < 100; i++) {
14
// do stuff
15
}

The collapse clause is used to convert a nested loop into a single loop then parallelize it.

1
#pragma omp parallel for collapse(2)
2
for (int i = 0; i < 4; i++)
3
{
4
for (int j = 0; j < 5; j++)
5
{
6
// do stuff
7
}
8
}

Master

Code is run only by master thread (ID = 0)

1
#pragma omp master
2
{
3
// code block
4
}

Single

Code is run by only 1 thread (not necessarily master)

1
#pragma omp parallel
2
{
3
// Code executed by all threads
4
5
#pragma omp single [private|firstprivate|copyprivate|nowait]
6
{
7
// Code executed by a single thread
8
}
9
10
// More code executed by all threads
11
}

private: Specify variables that should have private instances for each thread executing the omp single region.

1
int sharedVar = 0;
2
3
#pragma omp parallel
4
{
5
int privateVar; // Private to each thread
6
7
#pragma omp single private(privateVar)
8
{
9
privateVar = 42; // Initialize privateVar
10
}
11
12
// privateVar is now thread-local
13
}
  • Inside the omp single block, privateVar is initialized to 42, but this change does not affect the outer scope or other threads.
  • After the omp single block, the private copy (and its value 42) is discarded; the other threads' privateVar is unaffected and remains uninitialized.

firstprivate: Similar to private but initializes the private copies of variables with values from the outer scope.

1
int initialValue = 10;
2
3
#pragma omp parallel
4
{
5
int privateVar; // Private to each thread
6
7
#pragma omp single firstprivate(initialValue, privateVar)
8
{
9
privateVar = initialValue; // Initialize privateVar with initialValue
10
}
11
12
// privateVar is now thread-local with the initial value of 10
13
}
  • The firstprivate clause initializes the private copies of privateVar with the value of initialValue, which is 10 in this case.
  • After the omp single block, the private copies are discarded; only the one thread that executed the region performed the assignment, and that assignment does not persist for the other threads.

copyprivate: Specify variables that should be copied from the single thread’s context to the context of all other threads after the omp single block.

1
int sharedResult;
2
3
#pragma omp parallel
4
{
5
int privateResult; // Private to each thread
6
7
#pragma omp single copyprivate(sharedResult, privateResult)
8
{
9
privateResult = 42; // Compute a result
10
sharedResult = privateResult; // Share the result with other threads
11
}
12
13
// sharedResult now has the same value for all threads
14
}

nowait: Indicate that other threads should not wait at a barrier after the omp single block.

CUDA

Kernel Launch

Kernel_name<<< gridsize, blocksize >>>(arg1, arg2, …); // kernel must be declared __global__ void; the launch itself is a statement

Memory Management

cudaError_t cudaMalloc( void **devPtr, size_t size );

  • Example: cudaMalloc( (void **) &d_c, numbytes );

cudaError_t cudaFree( void *devPtr );

  • Example: cudaFree( d_c );

cudaError_t cudaMemcpy( void *dst, const void *src, size_t size, enum cudaMemcpyKind kind );

  • enum cudaMemcpyKind
    • cudaMemcpyHostToDevice
    • cudaMemcpyDeviceToHost
    • cudaMemcpyDeviceToDevice
  • Example: cudaMemcpy( d_c, c, numbytes, cudaMemcpyHostToDevice);

Error Checking

  • cudaError_t cudaGetLastError(void);
  • const char *cudaGetErrorString( cudaError_t code );
  • printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );

MPI

Set up and tear down

1
// Starts up the MPI runtime environment at the beginning of a run.
2
MPI_Init(&argc, &argv);
3
4
// Shuts down the MPI runtime environment at the end of a run.
5
MPI_Finalize();

Blocking Point-to-Point

1
// Send a message to one process.
2
int MPI_Send (void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
3
4
// Receive a message from one process.
5
int MPI_Recv (void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)
6
7
// Count received data elements.
8
int MPI_Get_count (MPI_Status *status, MPI_Datatype datatype, int *count)
9
10
// Wait for message arrival.
11
int MPI_Probe (int source, int tag, MPI_Comm comm, MPI_Status *status)

Related Functions: MPI_Bsend, MPI_Ssend, MPI_Rsend, MPI_Buffer_attach, MPI_Buffer_detach, MPI_Sendrecv, MPI_Sendrecv_replace, MPI_Get_elements

Non-blocking Point-to-Point

1
// Begin to receive a message.
2
int MPI_Irecv (void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request)
3
4
// Complete a non-blocking operation.
5
int MPI_Wait (MPI_Request *request, MPI_Status *status)
6
7
// Check or complete a non-blocking operation.
8
int MPI_Test (MPI_Request *request, int *flag, MPI_Status *status)
9
10
//Check message arrival.
11
int MPI_Iprobe (int source, int tag, MPI_Comm comm, int *flag, MPI_Status *status)

Derived Datatypes

1
// Create a strided homogeneous vector.
2
int MPI_Type_vector (int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype *newtype)
3
4
// Save a derived datatype
5
int MPI_Type_commit (MPI_Datatype *datatype)

Collective

1
// Send one message to all group members.
2
int MPI_Bcast (void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
3
4
// Receive from all group members.
5
int MPI_Gather (void *sendbuf, int sendcount, MPI_Datatype sendtype,
6
void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
7
8
// Send separate messages to all group members. (§4.6)
9
int MPI_Scatter (void *sendbuf, int sendcount, MPI_Datatype sendtype,
10
void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
11
12
// Combine messages from all group members. (§4.9.1)
13
int MPI_Reduce (void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
14
MPI_Op op, int root, MPI_Comm comm)
15
16
// performs a reduction of a variable on all processes, and sends result to all processes (and therefore takes longer)
17
MPI_Allreduce(&value, &value_sum, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

Communicators

1
// Count group members in communicator.
2
int MPI_Comm_size (MPI_Comm comm, int *size)
3
4
// Determine group rank of self. (§5.4.1)
5
int MPI_Comm_rank (MPI_Comm comm, int *rank)
6
7
// Duplicate with new context. (§5.4.2)
8
int MPI_Comm_dup (MPI_Comm comm, MPI_Comm *newcomm)
9
10
// Split into categorized sub-groups. (§5.4.2)
11
int MPI_Comm_split (MPI_Comm comm, int color, int key, MPI_Comm *newcomm)

Communicators with Topology

1
// Create with cartesian topology. (§6.5.1)
2
int MPI_Cart_create (MPI_Comm comm_old, int ndims, int *dims, int *periods,
3
int reorder, MPI_Comm *comm_cart)
4
5
// Suggest balanced dimension ranges. (§6.5.2)
6
int MPI_Dims_create (int nnodes, int ndims, int *dims)
7
8
// Determine rank from cartesian coordinates. (§6.5.4)
9
int MPI_Cart_rank (MPI_Comm comm, int *coords, int *rank)
10
11
// Determine cartesian coordinates from rank. (§6.5.4)
12
int MPI_Cart_coords (MPI_Comm comm, int rank, int maxdims, int *coords)
13
14
// Determine ranks for cartesian shift. (§6.5.5)
15
int MPI_Cart_shift (MPI_Comm comm, int direction, int disp, int *rank_source, int *rank_dest)
16
17
// Split into lower dimensional sub-grids. (§6.5.6)
18
int MPI_Cart_sub (MPI_Comm comm, int *remain_dims, MPI_Comm *newcomm)

Constants

  • Wildcards: MPI_ANY_TAG, MPI_ANY_SOURCE
  • Elementary Datatypes: MPI_CHAR, MPI_SHORT, MPI_INT, MPI_LONG, MPI_UNSIGNED_CHAR, MPI_UNSIGNED_SHORT, MPI_UNSIGNED, MPI_UNSIGNED_LONG, MPI_FLOAT, MPI_DOUBLE, MPI_LONG_DOUBLE, MPI_BYTE, MPI_PACKED
  • Reserved Communicators: MPI_COMM_WORLD, MPI_COMM_SELF
  • Reduction Operations: MPI_MAX, MPI_MIN, MPI_SUM, MPI_PROD, MPI_BAND, MPI_BOR, MPI_BXOR, MPI_LAND, MPI_LOR, MPI_LXOR