1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18#define _GNU_SOURCE
19#include <stdio.h>
20#include <sys/mman.h>
21#include <sys/types.h>
22#include <sys/wait.h>
23#include <sys/ipc.h>
24#include <sys/shm.h>
25#include <sys/stat.h>
26#include <sys/time.h>
27#include <linux/futex.h>
28#include <unistd.h>
29#include <asm/unistd.h>
30#include <string.h>
31#include <stdlib.h>
32#include <fcntl.h>
33#include <sched.h>
34#include <time.h>
35#include <stdarg.h>
36#include <pthread.h>
37#include <signal.h>
38#include <sys/prctl.h>
39
/*
 * Execute the PowerPC "dcbf" instruction on the byte located at @addr,
 * immediately followed by "sync". The "memory" clobber tells the compiler
 * not to cache or reorder memory accesses across this sequence.
 */
static inline void dcbf(volatile unsigned int *addr)
{
	unsigned char *target = (unsigned char *)addr;

	__asm__ __volatile__ ("dcbf %y0; sync"
			      : /* no outputs */
			      : "Z"(*target)
			      : "memory");
}
44
/*
 * Print a framed error banner to stdout containing @msg and the current
 * wall-clock time, then terminate the process with exit status 1.
 * This function does not return.
 */
static void err_msg(char *msg)
{
	time_t now = time(NULL);
	const char *stamp = ctime(&now);

	printf("=================================\n");
	printf(" Error: %s\n", msg);
	printf(" %s", stamp);
	printf("=================================\n");

	exit(1);
}
56
/*
 * map1 and map2 are both assigned in main() from shmat() calls on the same
 * shared-memory id. Worker chunks are addressed relative to map1; the
 * snapshot thread copies data back in through map2.
 */
static char *map1, *map2;

/* Printed as "PID" in the header of each per-thread verification log. */
static pid_t rim_process_pid;
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/*
 * Stop flag shared by all threads: a worker sets it to 1 when a word it
 * reads back differs from the pattern it expects, and every loop polls it.
 */
static volatile unsigned int corruption_found = 0;
82
83
84
85
86
87
88
89
90
91
/* Upper bound on worker threads; main() clamps the -n option to this. */
#define MAX_THREADS		64

/* Width and mask of the thread-id field inside a stored pattern word. */
#define THREAD_ID_BITS		8
#define THREAD_ID_MASK		((1 << THREAD_ID_BITS) - 1)

/* Per-worker bookkeeping: the id handed to each thread and its handle. */
static unsigned int rim_thread_ids[MAX_THREADS];
static pthread_t rim_threads[MAX_THREADS];
97
98
99
100
101
102
103
104
105
106
107
/* Bytes of the shared region owned by one worker thread. */
#define RIM_CHUNK_SIZE		1024

/* A "word" is one unsigned int; these derive its size in bytes and bits. */
#define BITS_PER_BYTE		8
#define WORD_SIZE		(sizeof(unsigned int))
#define WORD_BITS		(WORD_SIZE * BITS_PER_BYTE)

/* Number of words that fit in one chunk. */
#define WORDS_PER_CHUNK		(RIM_CHUNK_SIZE/WORD_SIZE)
113
114static inline char *compute_chunk_start_addr(unsigned int thread_id)
115{
116 char *chunk_start;
117
118 chunk_start = (char *)((unsigned long)map1 +
119 (thread_id * RIM_CHUNK_SIZE));
120
121 return chunk_start;
122}
123
124
125
126
127
128
129
130
131
/*
 * Width and mask of the word-offset field of a pattern word. The width is
 * the number of trailing zero bits of WORDS_PER_CHUNK, i.e. log2 of it
 * when WORDS_PER_CHUNK is a power of two.
 */
#define WORD_OFFSET_BITS	(__builtin_ctz(WORDS_PER_CHUNK))
#define WORD_OFFSET_MASK	((1 << WORD_OFFSET_BITS) - 1)
134
135static inline unsigned int compute_word_offset(char *start, unsigned int *addr)
136{
137 unsigned int delta_bytes, ret;
138 delta_bytes = (unsigned long)addr - (unsigned long)start;
139
140 ret = delta_bytes/WORD_SIZE;
141
142 return ret;
143}
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/*
 * The sweep-id field takes whatever bits of a word remain after the
 * thread-id and word-offset fields have been accounted for.
 */
#define SWEEP_ID_BITS		(WORD_BITS - (THREAD_ID_BITS + WORD_OFFSET_BITS))
#define SWEEP_ID_MASK		((1 << SWEEP_ID_BITS) - 1)
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/*
 * Bit positions of the three pattern fields, lowest first:
 * sweep-id at bit 0, word-offset directly above it, thread-id on top.
 */
#define SWEEP_ID_SHIFT		0
#define WORD_OFFSET_SHIFT	(SWEEP_ID_BITS)
#define THREAD_ID_SHIFT		(WORD_OFFSET_BITS + SWEEP_ID_BITS)
196
197
198
199
200
201static inline unsigned int compute_store_pattern(unsigned int tid,
202 unsigned int *addr,
203 unsigned int sweep_id)
204{
205 unsigned int ret = 0;
206 char *start = compute_chunk_start_addr(tid);
207 unsigned int word_offset = compute_word_offset(start, addr);
208
209 ret += (tid & THREAD_ID_MASK) << THREAD_ID_SHIFT;
210 ret += (word_offset & WORD_OFFSET_MASK) << WORD_OFFSET_SHIFT;
211 ret += (sweep_id & SWEEP_ID_MASK) << SWEEP_ID_SHIFT;
212 return ret;
213}
214
215
216static inline unsigned int extract_tid(unsigned int pattern)
217{
218 unsigned int ret;
219
220 ret = (pattern >> THREAD_ID_SHIFT) & THREAD_ID_MASK;
221 return ret;
222}
223
224
225static inline unsigned int extract_word_offset(unsigned int pattern)
226{
227 unsigned int ret;
228
229 ret = (pattern >> WORD_OFFSET_SHIFT) & WORD_OFFSET_MASK;
230
231 return ret;
232}
233
234
235static inline unsigned int extract_sweep_id(unsigned int pattern)
236
237{
238 unsigned int ret;
239
240 ret = (pattern >> SWEEP_ID_SHIFT) & SWEEP_ID_MASK;
241
242 return ret;
243}
244
245
246
247
248
249
250#define LOGDIR_NAME_SIZE 100
251static char logdir[LOGDIR_NAME_SIZE];
252
253static FILE *fp[MAX_THREADS];
254static const char logfilename[] ="Thread-%02d-Chunk";
255
256static inline void start_verification_log(unsigned int tid,
257 unsigned int *addr,
258 unsigned int cur_sweep_id,
259 unsigned int prev_sweep_id)
260{
261 FILE *f;
262 char logfile[30];
263 char path[LOGDIR_NAME_SIZE + 30];
264 char separator[2] = "/";
265 char *chunk_start = compute_chunk_start_addr(tid);
266 unsigned int size = RIM_CHUNK_SIZE;
267
268 sprintf(logfile, logfilename, tid);
269 strcpy(path, logdir);
270 strcat(path, separator);
271 strcat(path, logfile);
272 f = fopen(path, "w");
273
274 if (!f) {
275 err_msg("Unable to create logfile\n");
276 }
277
278 fp[tid] = f;
279
280 fprintf(f, "----------------------------------------------------------\n");
281 fprintf(f, "PID = %d\n", rim_process_pid);
282 fprintf(f, "Thread id = %02d\n", tid);
283 fprintf(f, "Chunk Start Addr = 0x%016lx\n", (unsigned long)chunk_start);
284 fprintf(f, "Chunk Size = %d\n", size);
285 fprintf(f, "Next Store Addr = 0x%016lx\n", (unsigned long)addr);
286 fprintf(f, "Current sweep-id = 0x%08x\n", cur_sweep_id);
287 fprintf(f, "Previous sweep-id = 0x%08x\n", prev_sweep_id);
288 fprintf(f, "----------------------------------------------------------\n");
289}
290
291static inline void log_anamoly(unsigned int tid, unsigned int *addr,
292 unsigned int expected, unsigned int observed)
293{
294 FILE *f = fp[tid];
295
296 fprintf(f, "Thread %02d: Addr 0x%lx: Expected 0x%x, Observed 0x%x\n",
297 tid, (unsigned long)addr, expected, observed);
298 fprintf(f, "Thread %02d: Expected Thread id = %02d\n", tid, extract_tid(expected));
299 fprintf(f, "Thread %02d: Observed Thread id = %02d\n", tid, extract_tid(observed));
300 fprintf(f, "Thread %02d: Expected Word offset = %03d\n", tid, extract_word_offset(expected));
301 fprintf(f, "Thread %02d: Observed Word offset = %03d\n", tid, extract_word_offset(observed));
302 fprintf(f, "Thread %02d: Expected sweep-id = 0x%x\n", tid, extract_sweep_id(expected));
303 fprintf(f, "Thread %02d: Observed sweep-id = 0x%x\n", tid, extract_sweep_id(observed));
304 fprintf(f, "----------------------------------------------------------\n");
305}
306
307static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies)
308{
309 FILE *f = fp[tid];
310 char logfile[30];
311 char path[LOGDIR_NAME_SIZE + 30];
312 char separator[] = "/";
313
314 fclose(f);
315
316 if (nr_anamolies == 0) {
317 remove(path);
318 return;
319 }
320
321 sprintf(logfile, logfilename, tid);
322 strcpy(path, logdir);
323 strcat(path, separator);
324 strcat(path, logfile);
325
326 printf("Thread %02d chunk has %d corrupted words. For details check %s\n",
327 tid, nr_anamolies, path);
328}
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357static void verify_chunk(unsigned int tid, unsigned int *next_store_addr,
358 unsigned int cur_sweep_id,
359 unsigned int prev_sweep_id)
360{
361 unsigned int *iter_ptr;
362 unsigned int size = RIM_CHUNK_SIZE;
363 unsigned int expected;
364 unsigned int observed;
365 char *chunk_start = compute_chunk_start_addr(tid);
366
367 int nr_anamolies = 0;
368
369 start_verification_log(tid, next_store_addr,
370 cur_sweep_id, prev_sweep_id);
371
372 for (iter_ptr = (unsigned int *)chunk_start;
373 (unsigned long)iter_ptr < (unsigned long)chunk_start + size;
374 iter_ptr++) {
375 unsigned int expected_sweep_id;
376
377 if (iter_ptr < next_store_addr) {
378 expected_sweep_id = cur_sweep_id;
379 } else {
380 expected_sweep_id = prev_sweep_id;
381 }
382
383 expected = compute_store_pattern(tid, iter_ptr, expected_sweep_id);
384
385 dcbf((volatile unsigned int*)iter_ptr);
386 observed = *iter_ptr;
387
388 if (observed != expected) {
389 nr_anamolies++;
390 log_anamoly(tid, iter_ptr, expected, observed);
391 }
392 }
393
394 end_verification_log(tid, nr_anamolies);
395}
396
/*
 * Restrict thread @th to run only on CPU number @cpu.
 *
 * A failure to set the affinity is reported on stderr but is not fatal.
 * The SCHED_FIFO request below is deliberately compiled out by the
 * constant-false "0 &&" guard and is kept only for reference.
 */
static void set_pthread_cpu(pthread_t th, int cpu)
{
	cpu_set_t run_cpu_mask;
	struct sched_param param;
	int rc;

	CPU_ZERO(&run_cpu_mask);
	CPU_SET(cpu, &run_cpu_mask);
	rc = pthread_setaffinity_np(th, sizeof(cpu_set_t), &run_cpu_mask);
	if (rc != 0) {
		/* pthread functions return the error number directly. */
		fprintf(stderr, "could not bind thread to cpu %d: %s\n",
			cpu, strerror(rc));
	}

	param.sched_priority = 1;
	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
	}
}
412
/*
 * Restrict the calling thread (pid 0 in sched_setaffinity terms) to run
 * only on CPU number @cpu.
 *
 * A failure to set the affinity is reported on stderr but is not fatal.
 * The SCHED_FIFO request below is deliberately compiled out by the
 * constant-false "0 &&" guard and is kept only for reference.
 */
static void set_mycpu(int cpu)
{
	cpu_set_t run_cpu_mask;
	struct sched_param param;

	CPU_ZERO(&run_cpu_mask);
	CPU_SET(cpu, &run_cpu_mask);
	if (sched_setaffinity(0, sizeof(cpu_set_t), &run_cpu_mask) == -1) {
		perror("sched_setaffinity");
	}

	param.sched_priority = 1;
	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
	}
}
427
/*
 * Non-zero while the snapshot thread holds the shared page read-only;
 * the SIGSEGV handler spins until it drops back to zero.
 */
static volatile int segv_wait = 0;
429
430static void segv_handler(int signo, siginfo_t *info, void *extra)
431{
432 while (segv_wait) {
433 sched_yield();
434 }
435
436}
437
438static void set_segv_handler(void)
439{
440 struct sigaction sa;
441
442 sa.sa_flags = SA_SIGINFO;
443 sa.sa_sigaction = segv_handler;
444
445 if (sigaction(SIGSEGV, &sa, NULL) == -1) {
446 perror("sigaction");
447 exit(EXIT_FAILURE);
448 }
449}
450
/* Set to 1 by the SIGALRM handler; polled by all threads as a stop flag. */
int timeout;
452
453
454
455
456
457
458static void *rim_fn(void *arg)
459{
460 unsigned int tid = *((unsigned int *)arg);
461
462 int size = RIM_CHUNK_SIZE;
463 char *chunk_start = compute_chunk_start_addr(tid);
464
465 unsigned int prev_sweep_id;
466 unsigned int cur_sweep_id = 0;
467
468
469 unsigned int pattern = cur_sweep_id;
470 unsigned int *pattern_ptr = &pattern;
471 unsigned int *w_ptr, read_data;
472
473 set_segv_handler();
474
475
476
477
478
479
480
481
482
483
484 for (w_ptr = (unsigned int *)chunk_start;
485 (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
486 w_ptr++) {
487
488 *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
489 *w_ptr = *pattern_ptr;
490 }
491
492 while (!corruption_found && !timeout) {
493 prev_sweep_id = cur_sweep_id;
494 cur_sweep_id = cur_sweep_id + 1;
495
496 for (w_ptr = (unsigned int *)chunk_start;
497 (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
498 w_ptr++) {
499 unsigned int old_pattern;
500
501
502
503
504
505
506 old_pattern = compute_store_pattern(tid, w_ptr, prev_sweep_id);
507
508
509
510
511
512 dcbf((volatile unsigned int*)w_ptr);
513
514
515 read_data = *w_ptr;
516
517
518
519
520
521 if (read_data != old_pattern) {
522
523 corruption_found = 1;
524 }
525
526
527
528
529
530 if (corruption_found || timeout) {
531
532
533
534
535
536
537
538
539 verify_chunk(tid, w_ptr, cur_sweep_id, prev_sweep_id);
540
541 return 0;
542 }
543
544
545
546
547
548 *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
549
550
551
552
553
554 *w_ptr = *pattern_ptr;
555 }
556 }
557
558 return NULL;
559}
560
561
/* First CPU number used for pinning; set by the -r option. */
static unsigned long start_cpu;

/* Number of worker threads; set by the -n option, 4 by default. */
static unsigned long nrthreads = 4;

/* Handle of the thread running mem_snapshot_fn(). */
static pthread_t mem_snapshot_thread;
566
567static void *mem_snapshot_fn(void *arg)
568{
569 int page_size = getpagesize();
570 size_t size = page_size;
571 void *tmp = malloc(size);
572
573 while (!corruption_found && !timeout) {
574
575 segv_wait = 1;
576
577 mprotect(map1, size, PROT_READ);
578
579
580
581
582
583 memcpy(tmp, map1, size);
584
585
586
587
588
589
590
591 memcpy(map2, tmp, size);
592
593
594
595
596 asm volatile("sync" ::: "memory");
597 mprotect(map1, size, PROT_READ|PROT_WRITE);
598 asm volatile("sync" ::: "memory");
599 segv_wait = 0;
600
601 usleep(1);
602 }
603
604 return 0;
605}
606
607void alrm_sighandler(int sig)
608{
609 timeout = 1;
610}
611
612int main(int argc, char *argv[])
613{
614 int c;
615 int page_size = getpagesize();
616 time_t now;
617 int i, dir_error;
618 pthread_attr_t attr;
619 key_t shm_key = (key_t) getpid();
620 int shmid, run_time = 20 * 60;
621 struct sigaction sa_alrm;
622
623 snprintf(logdir, LOGDIR_NAME_SIZE,
624 "/tmp/logdir-%u", (unsigned int)getpid());
625 while ((c = getopt(argc, argv, "r:hn:l:t:")) != -1) {
626 switch(c) {
627 case 'r':
628 start_cpu = strtoul(optarg, NULL, 10);
629 break;
630 case 'h':
631 printf("%s [-r <start_cpu>] [-n <nrthreads>] [-l <logdir>] [-t <timeout>]\n", argv[0]);
632 exit(0);
633 break;
634 case 'n':
635 nrthreads = strtoul(optarg, NULL, 10);
636 break;
637 case 'l':
638 strncpy(logdir, optarg, LOGDIR_NAME_SIZE - 1);
639 break;
640 case 't':
641 run_time = strtoul(optarg, NULL, 10);
642 break;
643 default:
644 printf("invalid option\n");
645 exit(0);
646 break;
647 }
648 }
649
650 if (nrthreads > MAX_THREADS)
651 nrthreads = MAX_THREADS;
652
653 shmid = shmget(shm_key, page_size, IPC_CREAT|0666);
654 if (shmid < 0) {
655 err_msg("Failed shmget\n");
656 }
657
658 map1 = shmat(shmid, NULL, 0);
659 if (map1 == (void *) -1) {
660 err_msg("Failed shmat");
661 }
662
663 map2 = shmat(shmid, NULL, 0);
664 if (map2 == (void *) -1) {
665 err_msg("Failed shmat");
666 }
667
668 dir_error = mkdir(logdir, 0755);
669
670 if (dir_error) {
671 err_msg("Failed mkdir");
672 }
673
674 printf("start_cpu list:%lu\n", start_cpu);
675 printf("number of worker threads:%lu + 1 snapshot thread\n", nrthreads);
676 printf("Allocated address:0x%016lx + secondary map:0x%016lx\n", (unsigned long)map1, (unsigned long)map2);
677 printf("logdir at : %s\n", logdir);
678 printf("Timeout: %d seconds\n", run_time);
679
680 time(&now);
681 printf("=================================\n");
682 printf(" Starting Test\n");
683 printf(" %s", ctime(&now));
684 printf("=================================\n");
685
686 for (i = 0; i < nrthreads; i++) {
687 if (1 && !fork()) {
688 prctl(PR_SET_PDEATHSIG, SIGKILL);
689 set_mycpu(start_cpu + i);
690 for (;;)
691 sched_yield();
692 exit(0);
693 }
694 }
695
696
697 sa_alrm.sa_handler = &alrm_sighandler;
698 sigemptyset(&sa_alrm.sa_mask);
699 sa_alrm.sa_flags = 0;
700
701 if (sigaction(SIGALRM, &sa_alrm, 0) == -1) {
702 err_msg("Failed signal handler registration\n");
703 }
704
705 alarm(run_time);
706
707 pthread_attr_init(&attr);
708 for (i = 0; i < nrthreads; i++) {
709 rim_thread_ids[i] = i;
710 pthread_create(&rim_threads[i], &attr, rim_fn, &rim_thread_ids[i]);
711 set_pthread_cpu(rim_threads[i], start_cpu + i);
712 }
713
714 pthread_create(&mem_snapshot_thread, &attr, mem_snapshot_fn, map1);
715 set_pthread_cpu(mem_snapshot_thread, start_cpu + i);
716
717
718 pthread_join(mem_snapshot_thread, NULL);
719 for (i = 0; i < nrthreads; i++) {
720 pthread_join(rim_threads[i], NULL);
721 }
722
723 if (!timeout) {
724 time(&now);
725 printf("=================================\n");
726 printf(" Data Corruption Detected\n");
727 printf(" %s", ctime(&now));
728 printf(" See logfiles in %s\n", logdir);
729 printf("=================================\n");
730 return 1;
731 }
732 return 0;
733}
734