Syntax Cheatsheet
OpenMP
Critical section
1#pragma omp critical(name)2{3 // code block4}Barrier
Explicit:
1#pragma omp barrierImplicit barriers: at the end of parallel, for/loop, and single regions (unless nowait is specified)
Atomic
1#pragma omp atomic2sharedVar += 1; // applies only to the single following statement (atomic does not take a code block)atomic read: Read the value of a shared variable atomically
1int x;2#pragma omp atomic read3x = sharedVar; // Read the value of sharedVar atomicallyatomic write: Update the value of a shared variable atomically
1#pragma omp atomic write2sharedVar = newValue; // Update sharedVar atomicallyatomic update: Perform arithmetic or logical operations on a shared variable atomically
1#pragma omp atomic update2sharedVar += 2; // Atomically increment sharedVar by 2atomic capture: Read the current value of a shared variable and then update it with a new value atomically.
Often used for cases where you need to retrieve and update a shared variable together.
1int oldValue;2#pragma omp atomic capture3{4 oldValue = sharedVar; // Read the current value5 sharedVar = newValue; // Update with a new value6}Reduction
Perform a reduction operation on a shared variable across multiple threads
Supports +, -, *, min, max, &, |, ^, &&, ||
1#pragma omp parallel for reduction(+:sum)2for (int i = 0; i < n; ++i) {3 sum += data[i];4}Parallel Region
Run stuff in parallel
1omp_set_num_threads(4);2#pragma omp parallel3{4 // code block5}Parallel Sections
1#pragma omp parallel sections2{3 #pragma omp section4 {5 printf ("id = %d, \n", omp_get_thread_num());6 }7
8 #pragma omp section9 {10 printf ("id = %d, \n", omp_get_thread_num());11 }12}Parallel For
Parallelize for loops
1#pragma omp parallel2{3 int id = omp_get_thread_num();4 cout << id << endl;5 #pragma omp for6 for (int i = 0; i < 100; i++) {7 // do stuff8 }9}10
11// Or12#pragma omp parallel for13for (int i = 0; i < 100; i++) {14 // do stuff15}The collapse clause is used to convert a nested loop into a single loop then parallelize it.
1#pragma omp parallel for collapse(2)2for (int i = 0; i < 4; i++)3{4 for (int j = 0; j < 5; j++)5 {6 // do stuff7 }8}Master
Code is run only by master thread (ID = 0)
1#pragma omp master2{3 // code block4}Single
Code is run by only 1 thread (not necessarily master)
1#pragma omp parallel2{3 // Code executed by all threads4
5 #pragma omp single [private|firstprivate|copyprivate|nowait]6 {7 // Code executed by a single thread8 }9
10 // More code executed by all threads11}private: Specify variables that should have private instances for each thread executing the omp single region.
1int sharedVar = 0;2
3#pragma omp parallel4{5 int privateVar; // Private to each thread6
7 #pragma omp single private(privateVar)8 {9 privateVar = 42; // Initialize privateVar10 }11
12 // privateVar is now thread-local13}- Inside the
omp singleblock,privateVaris initialized to 42, but this change does not affect the outer scope or other threads. - After the
omp singleblock, only the thread that executed the single region assigned 42 to its copy; the other threads' privateVar copies remain uninitialized.
firstprivate: Similar to private but initializes the private copies of variables with values from the outer scope.
1int initialValue = 10;2
3#pragma omp parallel4{5 int privateVar; // Private to each thread6
7 #pragma omp single firstprivate(initialValue, privateVar)8 {9 privateVar = initialValue; // Initialize privateVar with initialValue10 }11
12 // privateVar is now thread-local with the initial value of 1013}- The
firstprivateclause initializes the private copies ofprivateVarwith the value ofinitialValue, which is 10 in this case. - After the
omp singleblock, only the thread that executed the single region has its privateVar set to 10; the other threads' copies remain uninitialized.
copyprivate: Specify variables that should be copied from the single thread’s context to the context of
all other threads after the omp single block.
1int sharedResult;2
3#pragma omp parallel4{5 int privateResult; // Private to each thread6
7 #pragma omp single copyprivate(sharedResult, privateResult)8 {9 privateResult = 42; // Compute a result10 sharedResult = privateResult; // Share the result with other threads11 }12
13 // sharedResult now has the same value for all threads14}nowait: Indicate that other threads should not wait at a barrier after the omp single block.
CUDA
Kernel Launch
kernel_name<<< gridsize, blocksize >>>(arg1, arg2, …); // the kernel itself is declared as __global__ void kernel_name(...)
Memory Management
cudaError_t cudaMalloc( void **devPtr, size_t size );
- Example:
cudaMalloc( (void **) &d_c, numbytes );
cudaError_t cudaFree( void *devPtr );
- Example:
cudaFree( d_c );
cudaError_t cudaMemcpy( void *dst, const void *src, size_t size, enum cudaMemcpyKind kind );
- enum
cudaMemcpyKindcudaMemcpyHostToDevicecudaMemcpyDeviceToHostcudaMemcpyDeviceToDevice
- Example:
cudaMemcpy( d_c, c, numbytes, cudaMemcpyHostToDevice);
Error Checking
cudaError_t cudaGetLastError(void);const char *cudaGetErrorString( cudaError_t code );printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );
MPI
Set up and tear down
1// Starts up the MPI runtime environment at the beginning of a run.2MPI_Init(&argc, &argv);3
4// Shuts down the MPI runtime environment at the end of a run.5MPI_Finalize();Blocking Point-to-Point
1// Send a message to one process.2int MPI_Send (void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)3
4// Receive a message from one process.5int MPI_Recv (void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)6
7// Count received data elements.8int MPI_Get_count (MPI_Status *status, MPI_Datatype datatype, int *count)9
10// Wait for message arrival.11int MPI_Probe (int source, int tag, MPI_Comm comm, MPI_Status *status)Related Functions: MPI_Bsend, MPI_Ssend, MPI_Rsend, MPI_Buffer_attach, MPI_Buffer_detach, MPI_Sendrecv, MPI_Sendrecv_replace, MPI_Get_elements
Non-blocking Point-to-Point
1// Begin to receive a message.2int MPI_Irecv (void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request)3
4// Complete a non-blocking operation.5int MPI_Wait (MPI_Request *request, MPI_Status *status)6
7// Check or complete a non-blocking operation.8int MPI_Test (MPI_Request *request, int *flag, MPI_Status *status)9
10//Check message arrival.11int MPI_Iprobe (int source, int tag, MPI_Comm comm, int *flag, MPI_Status *status)Derived Datatypes
1// Create a strided homogeneous vector.2int MPI_Type_vector (int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype *newtype)3
4// Save a derived datatype5int MPI_Type_commit (MPI_Datatype *datatype)Collective
1// Send one message to all group members.2int MPI_Bcast (void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm)3
4// Receive from all group members.5int MPI_Gather (void *sendbuf, int sendcount, MPI_Datatype sendtype,6 void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)7
8// Send separate messages to all group members. (§4.6)9int MPI_Scatter (void *sendbuf, int sendcount, MPI_Datatype sendtype,10 void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)11
12// Combine messages from all group members. (§4.9.1)13int MPI_Reduce (void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,14 MPI_Op op, int root, MPI_Comm comm)15
16// performs a reduction of a variable on all processes, and sends result to all processes (and therefore takes longer)17MPI_Allreduce(&value, &value_sum, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD);Communicators
1// Count group members in communicator.2int MPI_Comm_size (MPI_Comm comm, int *size)3
4// Determine group rank of self. (§5.4.1)5int MPI_Comm_rank (MPI_Comm comm, int *rank)6
7// Duplicate with new context. (§5.4.2)8int MPI_Comm_dup (MPI_Comm comm, MPI_Comm *newcomm)9
10// Split into categorized sub-groups. (§5.4.2)11int MPI_Comm_split (MPI_Comm comm, int color, int key, MPI_Comm *newcomm)Communicators with Topology
1// Create with cartesian topology. (§6.5.1)2int MPI_Cart_create (MPI_Comm comm_old, int ndims, int *dims, int *periods,3 int reorder, MPI_Comm *comm_cart)4
5// Suggest balanced dimension ranges. (§6.5.2)6int MPI_Dims_create (int nnodes, int ndims, int *dims)7
8// Determine rank from cartesian coordinates. (§6.5.4)9int MPI_Cart_rank (MPI_Comm comm, int *coords, int *rank)10
11// Determine cartesian coordinates from rank. (§6.5.4)12int MPI_Cart_coords (MPI_Comm comm, int rank, int maxdims, int *coords)13
14// Determine ranks for cartesian shift. (§6.5.5)15int MPI_Cart_shift (MPI_Comm comm, int direction, int disp, int *rank_source, int *rank_dest)16
17// Split into lower dimensional sub-grids. (§6.5.6)18int MPI_Cart_sub (MPI_Comm comm, int *remain_dims, MPI_Comm *newcomm)Constants
- Wildcards:
MPI_ANY_TAG,MPI_ANY_SOURCE - Elementary Datatypes
MPI_CHAR, MPI_SHORT, MPI_INT, MPI_LONG, MPI_UNSIGNED_CHAR, MPI_UNSIGNED_SHORT, MPI_UNSIGNED, MPI_UNSIGNED_LONG, MPI_FLOAT, MPI_DOUBLE, MPI_LONG_DOUBLE, MPI_BYTE, MPI_PACKED - Reserved Communicators:
MPI_COMM_WORLD,MPI_COMM_SELF - Reduction Operations
MPI_MAX, MPI_MIN, MPI_SUM, MPI_PROD, MPI_BAND, MPI_BOR, MPI_BXOR, MPI_LAND, MPI_LOR, MPI_LXOR