1 /* ex: set tabstop=2 expandtab shiftwidth=2 softtabstop=2: */
2 /*
3 *
4 * Copyright (c) 2003 The Regents of the University of California. All
5 * rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * - Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * - Neither the name of the University nor the names of its
15 * contributors may be used to endorse or promote products derived
16 * from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
20 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
22 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
26 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32 /*
33 *
34 * Author: Nithya Ramanthan
35 *
36 */
37
/* NR todo: fix #root-detections, which can sometimes use
 * outdated routes - don't declare a node to be a root
 * just because its route is outdated! */
41
42 #include <sympathy.h>
43 #include <sim/radio.h>
44
45 static void check_sink_collisions();
46
47 static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t*
48 good_rx, buf_t* fault_buf);
49
50 /* For now just check if the next-hop is in the
51 * neighbor list */
52 static
53 void check_events(sympathy_node_info_t* stat)
54 {
55 if (stat->next_hop.next_hop == 0) return;
56
57 /* See if the next-hop selected is in the neighbor list */
58 if ((find_neighbor(stat, (Saddr_t) stat->next_hop.next_hop))< 0) {
59 bufprintf(stat->fault_buf, "ERROR: Next-hop: %d is not a neighbor!\n", stat->next_hop.next_hop);
60 }
61 }
62
/*
 * Mark test `code` as failed by OR-ing its S_CODE() bit into *error_events.
 * Bits that are already set are left untouched.
 */
static
void log_failed(int* error_events, int code)
{
  *error_events = *error_events | S_CODE(code);
}
68
69 static
70 int node_has_neighbors(sympathy_node_info_t* stat)
71 {
72 return (stat->num_neighbors > 0);
73 }
74
75 static
76 int route_to_sink(sympathy_node_info_t* stat)
77 {
78 uint8_t i;
79
80 /* If this is the sink, see if anybody has a route to the sink */
81 if (stat->addr == my_node_id) {
82 for (i = 0; i < sink.num_srcs; i++) {
83 if (route_valid(&sink.status_srcs[i])) {
84 if (sink.status_srcs[i].next_hop.sink == my_node_id) return 1;
85 }
86 }
87 }
88
89 /* Otherwise just check this node */
90 else if (route_valid(stat))
91 {
92 return(stat->next_hop.sink == my_node_id);
93 }
94
95 return 0;
96 }
97
98 /* Node is claimed to have not been heard from IF: no nodes have it on their
99 * neighbor list AND they have current metrics */
100 static
101 int node_heard_from(sympathy_node_info_t* stat)
102 {
103 uint8_t i, j;
104 buf_t* neighbor_buf = buf_new();
105
106 stat->num_heard_this_node = 0;
107
108 /* Parse metrics to see if anybody claims node as a neighbor */
109 for (i = 0; i < sink.num_srcs; i++)
110 {
111 if (sink.status_srcs[i].addr != stat->addr)
112 {
113 for (j = 0; j < sink.status_srcs[i].num_neighbors; j++)
114 {
115 if (sink.status_srcs[i].neighbors[j].node_id
116 == stat->addr)
117 {
118 if (neighbors_valid(&sink.status_srcs[i]))
119 {
120 stat->num_heard_this_node++;
121 bufprintf(neighbor_buf, "%d,", sink.status_srcs[i].addr);
122 }
123 }
124 }
125 }
126 }
127 #ifdef USE_BAYES
128 if (stat->num_heard_this_node > stat->max_num_heard_this_node)
129 stat->max_num_heard_this_node = stat->num_heard_this_node;
130 #endif
131
132 if (neighbor_buf->len > 0) {
133 bufprintf(stat->topology_info, "Num neighbors heard this node: %d {%s}\n",
134 stat->num_heard_this_node, neighbor_buf->buf);
135 }
136 else bufprintf(stat->topology_info, "NO NEIGHBORS heard this node!\n");
137 buf_free(neighbor_buf);
138 return (stat->num_heard_this_node > 0);
139 }
140
141
142 static
143 void check_sink_collisions()
144 {
145 sympathy_node_info_t *stat = find_status_ptr(my_node_id);
146 if (check_errs_rx(stat, &stat->num_pkts_crc_error, &stat->num_pkts_rx, stat->fault_buf)) {
147 log_failed(&stat->error_events, S_NO_COLLISIONS);
148 stat->congestion_detected = 1;
149 }
150 }
151
152 static void update_counters(sympathy_node_info_t* stat, uint8_t clear)
153 {
154 int j;
155 sympathy_node_app_info_t* snode;
156
157 stats_ctr_update(&stat->metrics_rx,clear, 0);
158 stats_ctr_update(&stat->sympathy_stats_rx,clear, 0);
159
160 for (j = 0; j < NUM_TOS_PKT_TYPES; j++)
161 {
162 stats_ctr_update(&stat->tos_packets[j],clear, 0);
163 }
164 stats_ctr_update(&stat->errs_rx,clear, 0);
165
166 for (j = 0; j < sink.num_apps_registered; j++)
167 {
168 snode = &stat->node_app_info[j];
169 stats_ctr_update(&snode->node_num_pkts_rx,clear, 0);
170 stats_ctr_update(&snode->node_send_failures,clear, 0);
171 stats_ctr_update(&snode->node_max_queue_occupancy,clear, 0);
172 stats_ctr_update(&snode->node_num_pkts_dropped,clear, 0);
173 stats_ctr_update(&snode->node_num_pkts_tx,clear, 1);
174 stats_ctr_update(&snode->app_stats_rx_from_node,clear, 0);
175 stats_ctr_update(&snode->sink_pkt_tx,clear, 0);
176 stats_ctr_update(&snode->sink_pkt_rx,clear, 0);
177 stats_ctr_update(&snode->sink_pkt_expected_rx,clear, 0);
178 }
179 stats_ctr_update(&stat->time_awake_mins,clear, 0);
180 stats_ctr_update(&stat->num_metrics_tx,clear, 1);
181 stats_ctr_update(&stat->num_stats_tx,clear, 1);
182 stats_ctr_update(&stat->num_pkts_tx_succeeded,clear, 0);
183 stats_ctr_update(&stat->num_pkts_rx,clear, 0);
184 stats_ctr_update(&stat->num_pkts_dropped,clear, 0);
185 stats_ctr_update(&stat->num_pkts_tx_failed,clear, 0);
186 stats_ctr_update(&stat->num_pkts_crc_error,clear, 0);
187
188 if (clear) stat->rebooted = 0;
189 }
190
191 void clear_buf(buf_t** buf)
192 {
193 buf_free(*buf);
194 (*buf) = buf_new();
195 }
196
197
198 /**** Running Tests ****/
199
200 /*
201 static void
202 compare_link_quality(sympathy_node_info_t* stat)
203 {
204 float quality;
205 int i;
206
207 for (i = 0; i < stat->num_neighbors; i++) {
208 if (stat->neighbor_info[i].sim_link_quality < 0) {
209 elog(LOG_ERR, "ERROR Couldnt get simulation link quality for nodes %d -> %d\n",
210 stat->addr, stat->neighbors[i].node_id);
211 continue;
212 }
213 quality = 100 * (stat->neighbors[i].quality/255);
214 elog(LOG_DEBUG(1), "sim-link: %f, reported: %d, diff: %f\n",
215 stat->neighbor_info[i].sim_link_quality,
216 quality,
217 abs_float(stat->neighbor_info[i].sim_link_quality -
218 quality));
219 }
220 }
221 */
222
/* application-specific tests */

/*
 * Decide whether too few messages arrived.
 *
 * msgs_have              number of messages actually seen
 * msgs_expected          number of messages that should have been seen
 * msg_reception_percent  minimum acceptable share of msgs_expected, in percent
 *
 * Returns 1 when msgs_have is below msg_reception_percent % of
 * msgs_expected, or when nothing at all arrived although something was
 * expected; returns 0 otherwise.
 */
static int
check_insufficient_msgs(int msgs_have, int msgs_expected, int msg_reception_percent)
{
  /* Widen before multiplying: msgs_expected * percent can exceed INT_MAX,
   * which is undefined behaviour for int and yields a bogus threshold. */
  const long long required =
      ((long long) msgs_expected * msg_reception_percent) / 100;

  if (msgs_have < required) return 1;

  /* Integer division can round the threshold down to 0; receiving nothing
   * while something was expected is still a failure. */
  return ((msgs_have == 0) && (msgs_expected > 0));
}
230
231 /* Check if node got requests */
232 static
233 int received_requests(sympathy_node_app_info_t* snode, buf_t* fault_buf)
234 {
235 if (check_insufficient_msgs(snode->node_num_pkts_rx.agg_prev_epoch,
236 snode->sink_pkt_tx.agg_prev_epoch, snode->pkt_reception_percent))
237 {
238 bufprintf(fault_buf,"\t0x%x: Num reqs node rx/Num reqs sink tx: %d/%d\n",
239 S_CODE(S_COMP_RX_REQS), snode->node_num_pkts_rx.agg_prev_epoch,
240 snode->sink_pkt_tx.agg_prev_epoch);
241 return 0;
242 }
243 return 1;
244 }
245
246 /* Compared to #pkts sink is expecting, is node sending
247 * sufficient responses */
248 static
249 int comp_tx_data(sympathy_node_app_info_t* snode, buf_t* fault_buf)
250 {
251 if (check_insufficient_msgs(snode->node_num_pkts_tx.agg_prev_epoch,
252 snode->sink_pkt_expected_rx.agg_prev_epoch, snode->pkt_reception_percent))
253 {
254 bufprintf(fault_buf,
255 "\tComp tx/ sink expected: %d/%d\n",
256 snode->node_num_pkts_tx.agg_prev_epoch,
257 snode->sink_pkt_expected_rx.agg_prev_epoch);
258 return 0;
259 }
260 return 1;
261 }
262
263 static
264 int node_tx_metrics(sympathy_node_info_t* stat)
265 {
266 return (stat->num_metrics_tx.agg_prev_epoch > 0);
267 }
268
269 int received_data(stats_ctr_t* pkts_rx, char* type, buf_t* fault_buf)
270 {
271 // NR why dont this work??
272 // int x = get_minutes_since_event(&pkts_rx->last_updated);
273 // if (x >= (EPOCH_MSEC/60000)) {
274 if (pkts_rx->agg_prev_epoch == 0) {
275 bufprintf(fault_buf, "\t%s: Num pkts rx: %d(%d)\n",
276 type, pkts_rx->ctr, pkts_rx->agg_prev_epoch);
277 return 0;
278 }
279 return 1;
280 }
281
282 /* Check how much data node tx compared to requests rx */
283 static
284 int receiving_data_node_tx(stats_ctr_t* pkts_sink_rx,
285 stats_ctr_t* pkts_node_tx, buf_t* fault_buf, int pkt_reception_percent)
286 {
287 if (check_insufficient_msgs(pkts_sink_rx->agg_prev_epoch,
288 pkts_node_tx->agg_prev_epoch, pkt_reception_percent))
289 {
290 bufprintf(fault_buf,
291 "\tsink rx pkts/Node tx pkts: %d/%d\n",
292 pkts_sink_rx->agg_prev_epoch, pkts_node_tx->agg_prev_epoch);
293 return 0;
294 }
295 return 1;
296 }
297
298 static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t*
299 good_rx, buf_t* fault_buf)
300 {
301 if ((errors_rx->agg_prev_epoch > 0)
302 && (errors_rx->agg_prev_epoch
303 >= (good_rx->agg_prev_epoch*PERCENT_GOOD_PACKETS_CONGESTION)/100))
304 {
305 bufprintf(fault_buf, "\tRx bad/good %d/%d\n",
306 errors_rx->agg_prev_epoch, good_rx->agg_prev_epoch);
307 return 1;
308 }
309 return 0;
310 }
311
312 static
313 int received_sufficient_data(stats_ctr_t* pkts_rx,
314 stats_ctr_t* expected_pkts_rx, buf_t* fault_buf, int pkt_reception_percent)
315 {
316 if (check_insufficient_msgs(pkts_rx->agg_prev_epoch,
317 expected_pkts_rx->agg_prev_epoch, pkt_reception_percent))
318 {
319 bufprintf(fault_buf, "\tSink rx/Expected to rx: %d/%d\n",
320 pkts_rx->agg_prev_epoch, expected_pkts_rx->agg_prev_epoch);
321 elog(LOG_DEBUG(1), "CHECK fault-buf: %s\n", fault_buf->buf);
322 return 0;
323 }
324
325 return 1;
326 }
327
328 /* For this test, we have to measure the exact time since
329 * the sink received a packet from the node */
330 static int received_some_pkts_from_node(sympathy_node_info_t* stat)
331 {
332 return event_valid(&stat->packet.last_updated);
333 //int x = get_minutes_since_event(&stat->packet.last_updated);
334 //elog(LOG_DEBUG(1), "CHECK mins-since rx packet: %x\n",
335 //x);
336 //if (x >= (EPOCH_MSEC/60000)) return 0;
337 //return 1;
338 }
339
#ifdef USE_BAYES
/*
 * Returns non-zero when the sink counted non-routing packets
 * (SNON_ROUTING_PKT) from the node in the previous epoch.
 *
 * tos_packets[] holds stats_ctr_t counters (see update_counters()), so the
 * element cannot be compared with 0 directly; the previous-epoch aggregate
 * is used, as in the other tests in this file.
 * NOTE(review): confirm agg_prev_epoch (rather than ctr) is the intended field.
 */
int received_non_symp_app_pkts_from_node(sympathy_node_info_t* stat)
{
  return (stat->tos_packets[SNON_ROUTING_PKT].agg_prev_epoch > 0);
}
#endif
346
347 /**** General Testing Framework ***/
348 static
349 void find_nodes_with_same_next_hop(sympathy_node_info_t* stat)
350 {
351 int i;
352
353 stat->num_with_same_next_hop = 0;
354
355 /* Find other nodes with the same next-hop as this node */
356 for (i = 0; i < sink.num_srcs; i++)
357 {
358 if (sink.status_srcs[i].addr != stat->addr)
359 {
360 if ((stat->next_hop.next_hop > 0)
361 && (sink.status_srcs[i].next_hop.next_hop == stat->next_hop.next_hop)
362 && (sink.status_srcs[i].next_hop.sink == stat->next_hop.sink)
363 && (route_valid(&sink.status_srcs[i])))
364 {
365 bufprintf(stat->topology_info, ", %d ", sink.status_srcs[i].addr);
366 stat->nodes_with_same_next_hop[stat->num_with_same_next_hop] =
367 sink.status_srcs[i].addr;
368 if (sink.status_srcs[i].failure_type > SFL_OK) {
369 bufprintf(stat->topology_info, "(root-cause=%s)",
370 decode_root_cause(sink.status_srcs[i].failure_type,
371 sink.status_srcs[i].addr));
372 }
373 stat->num_with_same_next_hop++;
374 }
375 }
376 }
377
378 if (stat->num_with_same_next_hop) {
379 bufprintf(stat->topology_info, "have the same next-hop(%d)!\n",
380 stat->next_hop.next_hop);
381 }
382 }
383
/*** Global Functions ***/

/*
 * Returns 1 when the S_CODE() bit for test `code` is clear in
 * `error_events` (the test passed), 0 when it is set.
 */
int check_passed(int error_events, int code)
{
  if (error_events & S_CODE(code)) return 0;
  return 1;
}
390
391 int call_track_lost_nodes(void* data, int interval, g_event_t* event)
392 {
393 int i;
394 track_lost_nodes();
395
396 /* Increment the window, and then clear all the counters for this current
397 * window */
398 inc_mod((uint16_t *) &sink.window, 1, TRACK_FAILURE_WINDOW_SIZE);
399 sink.metric_pd++;
400
401 /* Have to update_counters AFTER incrementing sink.window! */
402 for (i = 0; i < sink.num_srcs; i++)
403 {
404 update_counters(&sink.status_srcs[i], 1);
405 if (sink.status_srcs[i].failure_type > SFL_OK) {
406 sympathy_emview_text(&sink.status_srcs[i]);
407 }
408 }
409 g_status_dev_notify(sink.metrics_status);
410 return EVENT_RENEW;
411 }
412
413 /* These are in no specific order */
414 static
415 void step1_run_tests(sympathy_node_info_t* stat)
416 {
417 if (!route_to_sink(stat)) {
418 log_failed(&stat->error_events,S_ROUTE_TO_SINK);
419 }
420
421 if (!node_has_neighbors(stat)) {
422 log_failed(&stat->error_events,S_NEIGHBORS);
423 }
424
425 /* Check if node received mostly good packets from other nodes */
426 if (check_errs_rx(stat, &stat->num_pkts_crc_error, &stat->num_pkts_rx, stat->fault_buf)) {
427 log_failed(&stat->error_events, S_NO_COLLISIONS);
428 stat->congestion_detected = 1;
429 }
430
431 if (!received_data(&stat->metrics_rx, "data", stat->fault_buf)) {
432 log_failed(&stat->error_events, S_RX_DATA_THIS_PD);
433 }
434
435 if (!received_sufficient_data(&stat->metrics_rx,
436 &sink.expected_num_sympathy_metrics, stat->fault_buf,
437 SMSG_RECEPTION_THRESH_DEFAULT)) {
438 elog(LOG_DEBUG(1), "CHECK node %d didnt rx suff data!\n",
439 stat->addr);
440 log_failed(&stat->error_events, S_RX_SUFFICIENT_DATA);
441 }
442
443 if (!received_data(&stat->sympathy_stats_rx, "stats", stat->fault_buf)) {
444 log_failed(&stat->error_events, S_RX_STATS);
445 }
446
447 if (!node_tx_metrics(stat)) log_failed(&stat->error_events, S_COMP_TX_DATA);
448
449 if (!receiving_data_node_tx(&stat->sympathy_stats_rx,
450 &stat->num_stats_tx, stat->fault_buf, SMSG_RECEPTION_THRESH_DEFAULT))
451 {
452 log_failed(&stat->error_events,S_RX_STATS_COMP_TX);
453 }
454 if (!received_some_pkts_from_node(stat))
455 {
456 log_failed(&stat->error_events,S_RX_SOME_PKTS_FROM_NODE);
457 }
458 if (!node_heard_from(stat))
459 {
460 log_failed(&stat->error_events,S_NODE_HEARD_FROM);
461 }
462 }
463
464 static void step1_check_component(sympathy_node_app_info_t* snode)
465 {
466 /* If we haven't received app metrics, then we cant categorize! */
467 if (!received_data(&snode->app_stats_rx_from_node, "stats", snode->fault_buf))
468 {
469 log_failed(&snode->error_events, S_RX_STATS);
470 }
471 if (!received_requests(snode, snode->fault_buf))
472 {
473 log_failed(&snode->error_events, S_COMP_RX_REQS);
474 }
475 if (!comp_tx_data(snode, snode->fault_buf))
476 {
477 log_failed(&snode->error_events, S_COMP_TX_DATA);
478 }
479 if (!received_sufficient_data(&snode->sink_pkt_rx,
480 &snode->sink_pkt_expected_rx, snode->fault_buf, snode->pkt_reception_percent))
481 {
482 log_failed(&snode->error_events, S_RX_SUFFICIENT_DATA);
483 }
484
485 if (!received_data(&snode->sink_pkt_rx, "data", snode->fault_buf))
486 {
487 log_failed(&snode->error_events, S_RX_DATA_THIS_PD);
488 }
489 }
490
491 // Check for failures as much as is possible - so we use the receipt
492 // of metrics to determine if we have received sufficient data from
493 // a node. We will then go on to check the components.
494 static
495 int step2_set_failure(int error_events)
496 {
497 if (!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE)) {
498 return SFL_NO_DATA;
499 }
500 else if (!check_passed(error_events, S_RX_SUFFICIENT_DATA)) {
501 return SFL_INSUFFICIENT_DATA;
502 }
503 return SFL_OK;
504 }
505
506 /* Returns failure for system, and sets root-cause */
507 static
508 int step3_root_cause_system_failure(sympathy_node_info_t* stat)
509 {
510 int failure = SFL_NO_DATA;
511 check_sink_collisions();
512
513 if (!node_has_neighbors(stat)) {
514 stat->failure_root_cause = SRC_NO_NEIGHBORS;
515 }
516 else if (!node_heard_from(stat)) {
517 stat->failure_root_cause = SRC_NOBODY_CLAIMS_SINK_AS_NEIGHBOR;
518 }
519 else if (!route_to_sink(stat)) {
520 stat->failure_root_cause = SRC_NO_ROUTE;
521 }
522 else failure = SFL_OK;
523 elog(LOG_DEBUG(1), "CHECK system failure = %d, root-cause: %d\n",
524 failure, stat->failure_root_cause);
525 return failure;
526 }
527
528 static
529 int step3_root_cause_failure(sympathy_node_info_t* stat, int error_events,
530 int test_events)
531 {
532 if (test_events) check_events(stat);
533
534 if (stat->rebooted) return SRC_NODE_REBOOTED;
535
536 else if ((!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE))
537 && (!check_passed(stat->error_events, S_NODE_HEARD_FROM))) {
538 return SRC_NODE_FAILED;
539 }
540
541 else if (!check_passed(stat->error_events, S_NEIGHBORS)) {
542 return SRC_NO_NEIGHBORS;
543 }
544
545 else if (!check_passed(stat->error_events, S_ROUTE_TO_SINK)) {
546 return SRC_NO_ROUTE;
547 }
548
549 /* If the sink has'nt received statistics from this component,
550 * then sympathy can't do anything, and just assumes that the
551 * sink is not receiving data sent by the node. */
552 else if (!check_passed(error_events, S_RX_STATS)) {
553 return SRC_BAD_PATH_TO_SINK;
554 }
555
556 /* maybe its beacuse the component is not receiving the
557 * requests */
558 else if (!check_passed(error_events, S_COMP_RX_REQS)) {
559 return SRC_BAD_PATH_TO_NODE;
560 }
561 /* OW because the node is not transmitting data in response */
562 else if (!check_passed(error_events, S_COMP_TX_DATA)) {
563 return SRC_BAD_NODE_TRANSMIT;
564 }
565
566 return SRC_BAD_PATH_TO_SINK;
567 }
568
569 /* This func is only called if the node has a failure to begin with */
570 static
571 void step4_localize_failure(sympathy_node_info_t* stat)
572 {
573 sympathy_node_info_t* curr_stat = NULL;
574 sympathy_node_info_t* sink_stat = find_status_ptr(my_node_id);
575 int iter = 0;
576
577 stat->failure_localization = S_SELF;
578 find_nodes_with_same_next_hop(stat);
579
580 /* If the sink has a failure, then all failures are localized
581 * to the sink - other than the sink's failure, which is localized to
582 * itself. */
583 if (stat->addr == my_node_id) {
584 stat->failure_localization = S_SELF;
585 return;
586 }
587
588 /* If the node rebooted, nothing can explain that along the path */
589 else if (stat->rebooted) {
590 stat->failure_localization = S_SELF;
591 return;
592 }
593
594 /* If the sink has a failure, or the node's failure is just a communication
595 * issue and the sink is experiencing congestion, then localize it to the
596 * sink */
597 else if ((sink_stat->failure_type > SFL_OK)
598 || ((stat->failure_root_cause < SRC_NO_NEIGHBORS)
599 && (sink_stat->congestion_detected))) {
600 stat->failure_localization = S_SINK;
601 stat->source_node_failure = my_node_id;
602 return;
603 }
604
605 /* Otherwise we try to find the source of the failure somewhere in the
606 * network. Even if the route is not valid, we still use it as an indicator
607 * for fault localization. This is a reasonable thing to do because the
608 * route is probably invalid as a result of the fault. */
609 //else if (route_valid(stat)) {
610 else {
611 curr_stat = stat;
612 while ((curr_stat = iter_next_hop(curr_stat, &iter,
613 curr_stat->next_hop.sink))) {
614
615 /* If this node along the path has no root-caused failure
616 * then it cannot be the source of the node's failure */
617 if (!(curr_stat->congestion_detected
618 || (curr_stat->failure_type > SFL_OK))) continue;
619
620 /* If we find a worse root-cause on a node closer to the sink
621 * (worse defined by ordering of root-causes, then that node
622 * is the source of the current node's problems.
623 * There are some exceptions to this ordering.
624 * If the current node's failure is less critical than
625 * no-neighbors, then it can be explained by any failure
626 * downstream from it - NOT just one that is "worse" */
627 if ((curr_stat->failure_root_cause >= stat->failure_root_cause)
628 || (stat->failure_root_cause < SRC_NO_NEIGHBORS)) {
629 stat->failure_localization = S_PATH;
630 stat->source_node_failure = curr_stat->addr;
631 }
632
633 /* If the next hop isn't valid, we can't guage the rest of
634 * the route, so we keep the current status */
635 if (!route_valid(curr_stat)) break;
636 }
637 }
638
639 /* If failure localized to self, and there is congestion at this node,
640 * then we will localize it to the path, but specify the node as itself */
641 if (stat->failure_localization == S_SELF) {
642 if (stat->congestion_detected) {
643 stat->source_node_failure = stat->addr;
644 stat->failure_localization = S_PATH;
645 }
646 }
647
648 return;
649 }
650
651 void track_lost_nodes()
652 {
653 int i, tmp_root_cause, j;
654 sympathy_node_app_info_t* snode;
655 sympathy_node_info_t* stat;
656 int notify = 0;
657
658 elog(LOG_DEBUG(1), "window: %d, metric-pd: %d\n", sink.window,
659 sink.metric_pd);
660
661 /* Only include nodes whom we have heard from in the past
662 * epoch - in analysis of other nodes */
663 for (i = 0; i < sink.num_srcs; i++)
664 {
665 stat = &sink.status_srcs[i];
666
667 /* Calculate values for last-epoch */
668 update_counters(stat, 0);
669
670 /* Clear previous values, which are stored until next calculation */
671 stat->error_events = 0;
672 clear_buf(&stat->fault_buf);
673 clear_buf(&stat->topology_info);
674
675 if (!received_data(&stat->metrics_rx, "data", NULL)) {
676 stat->metrics_valid = 0;
677 }
678 else stat->metrics_valid = 1;
679 stat->congestion_detected = 0;
680 }
681
682 /* Don't begin checking for failures until we have had
683 * TRACK_FAILURE_WINDOW_SIZE metrics periods */
684 if (sink.metric_pd < TRACK_FAILURE_WINDOW_SIZE) return;
685
686 /* Set the failure category for all nodes */
687 for (i = 0; i < sink.num_srcs; i++)
688 {
689 stat = &sink.status_srcs[i];
690 tmp_root_cause = stat->failure_root_cause;
691
692 /* If the node is a sink, then look for system wide failures because the
693 * sink itself won't have any failures we want to detect */
694 if (stat->addr == my_node_id)
695 {
696 stat->failure_type = step3_root_cause_system_failure(stat);
697 }
698
699 /* Otherwise check failures on the node, sympathy and remaining components */
700 else
701 {
702 step1_run_tests(stat);
703
704 // NR make sympathy one of the components and check it as such?
705 stat->failure_type = step2_set_failure(stat->error_events);
706
707 /* Then check metrics for each application registered with sympathy */
708 for (j = 0; j < sink.num_apps_registered; j++) {
709 snode = &stat->node_app_info[j];
710 snode->error_events = 0;
711 clear_buf(&snode->fault_buf);
712
713 /* Run tests on component */
714 step1_check_component(snode);
715 snode->failure_type = step2_set_failure(snode->error_events);
716
717 /* If we have a failure from this node, then try to root-cause it */
718 if (snode->failure_type > SFL_OK) {
719 step3_root_cause_failure(stat, snode->error_events, 0);
720 }
721
722 /* If the current failure assignment for the node is an OK, and a component
723 * has a failure, then we set the current failure assignment
724 * to Insufficient data */
725 if ((stat->failure_type == SFL_OK) && (snode->failure_type > SFL_OK)) {
726 stat->failure_type = SFL_INSUFFICIENT_DATA;
727 stat->failure_root_cause = snode->failure_root_cause;
728 }
729 }
730
731 /* If we have a failure from this node, then try to root-cause it */
732 if (stat->failure_type > SFL_OK) {
733 stat->failure_root_cause =
734 step3_root_cause_failure(stat, stat->error_events, 1);
735 elog(LOG_DEBUG(1), "CHECK node %d had failure: %d, root-cause: %d\n",
736 stat->addr, stat->failure_type, stat->failure_root_cause);
737 }
738 }
739
740 /* If this root-cause is new, we note it and notify devices */
741 if (stat->failure_type > SFL_OK) {
742 if (tmp_root_cause != stat->failure_root_cause) {
743 stat->period_root_caused = sink.metric_pd;
744 notify = 1;
745 }
746 }
747 }
748
749 if (notify) {
750 g_status_dev_notify(sink.summary_status);
751 g_status_dev_notify(sink.fail_status);
752 }
753
754 #ifdef USE_BAYES
755 /* create bayes network based on routes */
756 bayes_classify_network(my_node_id);
757 #endif
758
759 /* Clear the agg_prev_epoch values, and update counters */
760 stats_ctr_update(&sink.expected_num_sympathy_metrics, 0, 0);
761 stats_ctr_update(&sink.expected_num_sympathy_metrics, 1, 0);
762
763 /* Diagnose the failure - either caused by congestion along
764 * the path, or what? */
765 for (j = 0; j < sink.num_srcs; j++) {
766 stat = &sink.status_srcs[j];
767 if (stat->failure_type > SFL_OK) step4_localize_failure(stat);
768 }
769 }
770
/* This page was automatically generated by the LXR engine.
 * Visit the LXR main site for more information. */