~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
cvs/emstar/devel/sympathy_devel/sympathy_analyze.c


  1 /* ex: set tabstop=2 expandtab shiftwidth=2 softtabstop=2: */
  2 /*
  3  *
  4  * Copyright (c) 2003 The Regents of the University of California.  All 
  5  * rights reserved.
  6  *
  7  * Redistribution and use in source and binary forms, with or without
  8  * modification, are permitted provided that the following conditions
  9  * are met:
 10  *
 11  * - Redistributions of source code must retain the above copyright
 12  *   notice, this list of conditions and the following disclaimer.
 13  *
 14  * - Neither the name of the University nor the names of its
 15  *   contributors may be used to endorse or promote products derived
 16  *   from this software without specific prior written permission.
 17  *
 18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
 19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 20  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 21  * PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
 22  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 23  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 25  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 26  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29  *
 30  */
 31 
 32  /*
 33   *
 34   * Author: Nithya Ramanthan
 35   *
 36   */
 37 
/* NR todo: fix #root-detections. Analysis can sometimes use
 * outdated routes - don't declare a node a root
 * just because its route is outdated! */
 41 
 42 #include <sympathy.h>
 43 #include <sim/radio.h>
 44 
 45 static void check_sink_collisions();
 46 
 47 static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t* 
 48     good_rx, buf_t* fault_buf);
 49 
 50 /* For now just check if the next-hop is in the
 51  * neighbor list */
 52 static
 53 void check_events(sympathy_node_info_t* stat)
 54 {
 55   if (stat->next_hop.next_hop == 0) return;
 56 
 57   /* See if the next-hop selected is in the neighbor list */
 58   if ((find_neighbor(stat, (Saddr_t) stat->next_hop.next_hop))< 0) {
 59     bufprintf(stat->fault_buf, "ERROR: Next-hop: %d is not a neighbor!\n", stat->next_hop.next_hop);
 60   }
 61 }
 62 
 63 static
 64 void log_failed(int* error_events, int code)
 65 {
 66   *error_events |= S_CODE(code);
 67 }
 68 
 69 static 
 70 int node_has_neighbors(sympathy_node_info_t* stat)
 71 {
 72   return (stat->num_neighbors > 0);
 73 }
 74 
 75 static 
 76 int route_to_sink(sympathy_node_info_t* stat)
 77 {
 78   uint8_t i;
 79 
 80   /* If this is the sink, see if anybody has a route to the sink */
 81   if (stat->addr == my_node_id) {
 82     for (i = 0; i < sink.num_srcs; i++) {
 83       if (route_valid(&sink.status_srcs[i])) {
 84         if (sink.status_srcs[i].next_hop.sink == my_node_id) return 1;
 85       }
 86     }
 87   }
 88 
 89   /* Otherwise just check this node */
 90   else if (route_valid(stat))
 91   {
 92     return(stat->next_hop.sink == my_node_id);
 93   }
 94 
 95   return 0;
 96 }
 97 
 98 /* Node is claimed to have not been heard from IF: no nodes have it on their 
 99  * neighbor list AND they have current metrics */
100 static
101 int node_heard_from(sympathy_node_info_t* stat)
102 {
103   uint8_t i, j;
104   buf_t* neighbor_buf = buf_new();
105 
106   stat->num_heard_this_node = 0;
107 
108   /* Parse metrics to see if anybody claims node as a neighbor */
109   for (i = 0; i < sink.num_srcs; i++)
110   {
111     if (sink.status_srcs[i].addr != stat->addr) 
112     {
113       for (j = 0; j < sink.status_srcs[i].num_neighbors; j++)
114       {
115               if (sink.status_srcs[i].neighbors[j].node_id 
116                                                 == stat->addr) 
117         {
118           if (neighbors_valid(&sink.status_srcs[i]))
119           {
120             stat->num_heard_this_node++;
121             bufprintf(neighbor_buf, "%d,", sink.status_srcs[i].addr); 
122           }
123         }
124       }
125     }
126   }
127 #ifdef USE_BAYES
128   if (stat->num_heard_this_node > stat->max_num_heard_this_node)
129     stat->max_num_heard_this_node = stat->num_heard_this_node;
130 #endif
131 
132   if (neighbor_buf->len > 0) {
133     bufprintf(stat->topology_info, "Num neighbors heard this node: %d {%s}\n", 
134         stat->num_heard_this_node, neighbor_buf->buf);
135   }
136   else bufprintf(stat->topology_info, "NO NEIGHBORS heard this node!\n");
137   buf_free(neighbor_buf);
138   return (stat->num_heard_this_node > 0);
139 }
140 
141 
142 static 
143 void check_sink_collisions()
144 {
145   sympathy_node_info_t *stat = find_status_ptr(my_node_id);
146   if (check_errs_rx(stat, &stat->num_pkts_crc_error, &stat->num_pkts_rx, stat->fault_buf)) {
147     log_failed(&stat->error_events, S_NO_COLLISIONS);
148     stat->congestion_detected = 1;
149   }
150 }
151 
152 static void update_counters(sympathy_node_info_t* stat, uint8_t clear)
153 {
154   int j;
155   sympathy_node_app_info_t* snode;
156 
157   stats_ctr_update(&stat->metrics_rx,clear, 0);
158   stats_ctr_update(&stat->sympathy_stats_rx,clear, 0);
159 
160   for (j = 0; j < NUM_TOS_PKT_TYPES; j++)
161   {
162     stats_ctr_update(&stat->tos_packets[j],clear, 0);
163   }
164   stats_ctr_update(&stat->errs_rx,clear, 0);
165 
166   for (j = 0; j < sink.num_apps_registered; j++)
167   {
168     snode = &stat->node_app_info[j];
169     stats_ctr_update(&snode->node_num_pkts_rx,clear, 0);
170     stats_ctr_update(&snode->node_send_failures,clear, 0); 
171     stats_ctr_update(&snode->node_max_queue_occupancy,clear, 0);
172     stats_ctr_update(&snode->node_num_pkts_dropped,clear, 0);
173     stats_ctr_update(&snode->node_num_pkts_tx,clear, 1);
174     stats_ctr_update(&snode->app_stats_rx_from_node,clear, 0);
175     stats_ctr_update(&snode->sink_pkt_tx,clear, 0);
176     stats_ctr_update(&snode->sink_pkt_rx,clear, 0);
177     stats_ctr_update(&snode->sink_pkt_expected_rx,clear, 0);
178   }
179   stats_ctr_update(&stat->time_awake_mins,clear, 0);
180   stats_ctr_update(&stat->num_metrics_tx,clear, 1);
181   stats_ctr_update(&stat->num_stats_tx,clear, 1);
182   stats_ctr_update(&stat->num_pkts_tx_succeeded,clear, 0);
183   stats_ctr_update(&stat->num_pkts_rx,clear, 0);
184   stats_ctr_update(&stat->num_pkts_dropped,clear, 0);
185   stats_ctr_update(&stat->num_pkts_tx_failed,clear, 0);
186   stats_ctr_update(&stat->num_pkts_crc_error,clear, 0);
187 
188   if (clear) stat->rebooted = 0;
189 }
190 
191 void clear_buf(buf_t** buf)
192 {
193   buf_free(*buf);
194   (*buf) = buf_new();
195 }
196 
197 
198 /**** Running Tests ****/
199 
200 /*
201 static void
202 compare_link_quality(sympathy_node_info_t* stat)
203 {
204   float quality;
205   int i;
206 
207   for (i = 0; i < stat->num_neighbors; i++) {
208     if (stat->neighbor_info[i].sim_link_quality < 0) {
209       elog(LOG_ERR, "ERROR Couldnt get simulation link quality for nodes %d -> %d\n", 
210         stat->addr, stat->neighbors[i].node_id);
211       continue;
212     }
213     quality = 100 * (stat->neighbors[i].quality/255);
214     elog(LOG_DEBUG(1), "sim-link: %f, reported: %d, diff: %f\n", 
215       stat->neighbor_info[i].sim_link_quality, 
216       quality, 
217       abs_float(stat->neighbor_info[i].sim_link_quality - 
218         quality));
219   }
220 }
221 */
222 
/* application-specific tests */

/* Returns 1 when msgs_have is insufficient relative to msgs_expected:
 * either it is below msg_reception_percent percent of msgs_expected
 * (integer arithmetic), or nothing at all was received although
 * something was expected.  Returns 0 otherwise. */
static int
check_insufficient_msgs(int msgs_have, int msgs_expected, int msg_reception_percent)
{
  int required = (msgs_expected * msg_reception_percent) / 100;

  if (msgs_have < required) return 1;

  /* Integer division can round `required` down to 0; still flag the case
   * where something was expected and nothing arrived. */
  if ((msgs_have == 0) && (msgs_expected > 0)) return 1;

  return 0;
}
230 
231 /* Check if node got requests */
232 static
233 int received_requests(sympathy_node_app_info_t* snode, buf_t* fault_buf)
234 {
235   if (check_insufficient_msgs(snode->node_num_pkts_rx.agg_prev_epoch, 
236         snode->sink_pkt_tx.agg_prev_epoch, snode->pkt_reception_percent))
237   {
238     bufprintf(fault_buf,"\t0x%x: Num reqs node rx/Num reqs sink tx: %d/%d\n", 
239         S_CODE(S_COMP_RX_REQS), snode->node_num_pkts_rx.agg_prev_epoch, 
240         snode->sink_pkt_tx.agg_prev_epoch);
241     return 0;
242   }
243   return 1;
244 }
245 
246 /* Compared to #pkts sink is expecting, is node sending 
247  * sufficient responses */
248 static
249 int comp_tx_data(sympathy_node_app_info_t* snode, buf_t* fault_buf)
250 {
251   if (check_insufficient_msgs(snode->node_num_pkts_tx.agg_prev_epoch,
252         snode->sink_pkt_expected_rx.agg_prev_epoch, snode->pkt_reception_percent))
253   {
254     bufprintf(fault_buf, 
255       "\tComp tx/ sink expected: %d/%d\n", 
256        snode->node_num_pkts_tx.agg_prev_epoch, 
257        snode->sink_pkt_expected_rx.agg_prev_epoch);
258     return 0;
259   }
260   return 1;
261 }
262 
263 static
264 int node_tx_metrics(sympathy_node_info_t* stat)
265 {
266   return (stat->num_metrics_tx.agg_prev_epoch > 0);
267 }
268 
269 int received_data(stats_ctr_t* pkts_rx, char* type, buf_t* fault_buf)
270 {
271   //  NR why dont this work??
272 //  int x = get_minutes_since_event(&pkts_rx->last_updated);
273 //  if (x >= (EPOCH_MSEC/60000)) {
274   if (pkts_rx->agg_prev_epoch == 0) {
275     bufprintf(fault_buf, "\t%s: Num pkts rx: %d(%d)\n", 
276         type, pkts_rx->ctr, pkts_rx->agg_prev_epoch);
277     return 0;
278   }
279   return 1;
280 }
281 
282 /* Check how much data node tx compared to requests rx */
283 static
284 int receiving_data_node_tx(stats_ctr_t* pkts_sink_rx, 
285     stats_ctr_t* pkts_node_tx, buf_t* fault_buf, int pkt_reception_percent)
286 {
287   if (check_insufficient_msgs(pkts_sink_rx->agg_prev_epoch, 
288         pkts_node_tx->agg_prev_epoch, pkt_reception_percent))
289   {
290     bufprintf(fault_buf, 
291       "\tsink rx pkts/Node tx pkts: %d/%d\n", 
292       pkts_sink_rx->agg_prev_epoch, pkts_node_tx->agg_prev_epoch);
293     return 0;
294   }
295   return 1;
296 }
297 
298 static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t* 
299     good_rx, buf_t* fault_buf)
300 {
301   if ((errors_rx->agg_prev_epoch > 0) 
302      && (errors_rx->agg_prev_epoch 
303        >= (good_rx->agg_prev_epoch*PERCENT_GOOD_PACKETS_CONGESTION)/100))
304   {
305     bufprintf(fault_buf, "\tRx bad/good %d/%d\n",
306       errors_rx->agg_prev_epoch, good_rx->agg_prev_epoch);
307     return 1;
308   }
309   return 0;
310 }
311 
312 static
313 int received_sufficient_data(stats_ctr_t* pkts_rx, 
314     stats_ctr_t* expected_pkts_rx, buf_t* fault_buf, int pkt_reception_percent)
315 {
316   if (check_insufficient_msgs(pkts_rx->agg_prev_epoch, 
317         expected_pkts_rx->agg_prev_epoch, pkt_reception_percent))
318   {
319     bufprintf(fault_buf, "\tSink rx/Expected to rx: %d/%d\n",
320           pkts_rx->agg_prev_epoch, expected_pkts_rx->agg_prev_epoch);
321     elog(LOG_DEBUG(1), "CHECK fault-buf: %s\n", fault_buf->buf);
322     return 0;
323   }
324 
325   return 1;
326 }
327 
328 /* For this test, we have to measure the exact time since
329  * the sink received a packet from the node */
330 static int received_some_pkts_from_node(sympathy_node_info_t* stat) 
331 {
332   return event_valid(&stat->packet.last_updated);
333   //int x = get_minutes_since_event(&stat->packet.last_updated);
334   //elog(LOG_DEBUG(1), "CHECK mins-since rx packet: %x\n",
335      //x);
336   //if (x >= (EPOCH_MSEC/60000)) return 0;
337   //return 1;
338 }
339 
#ifdef USE_BAYES
/* Returns 1 if non-routing TOS packets were counted for this node in the
 * previous epoch, 0 otherwise.
 *
 * The entries of tos_packets are counters (update_counters() passes their
 * addresses to stats_ctr_update()), so a counter field has to be compared
 * rather than the counter object itself.
 * NOTE(review): agg_prev_epoch is used to match the other tests in this
 * file - confirm that this, rather than ctr, is the intended value. */
int received_non_symp_app_pkts_from_node(sympathy_node_info_t* stat)
{
  return (stat->tos_packets[SNON_ROUTING_PKT].agg_prev_epoch > 0);
}
#endif
346 
347 /**** General Testing Framework ***/
348 static
349 void find_nodes_with_same_next_hop(sympathy_node_info_t* stat)
350 {
351   int i;
352 
353   stat->num_with_same_next_hop = 0;
354 
355         /* Find other nodes with the same next-hop as this node */
356         for (i = 0; i < sink.num_srcs; i++)
357         {
358                 if (sink.status_srcs[i].addr != stat->addr) 
359     {
360       if ((stat->next_hop.next_hop > 0)
361         && (sink.status_srcs[i].next_hop.next_hop == stat->next_hop.next_hop)
362           && (sink.status_srcs[i].next_hop.sink == stat->next_hop.sink)
363           && (route_valid(&sink.status_srcs[i])))
364       {
365         bufprintf(stat->topology_info, ", %d ", sink.status_srcs[i].addr);
366         stat->nodes_with_same_next_hop[stat->num_with_same_next_hop] = 
367           sink.status_srcs[i].addr;
368         if (sink.status_srcs[i].failure_type > SFL_OK) {
369           bufprintf(stat->topology_info, "(root-cause=%s)", 
370               decode_root_cause(sink.status_srcs[i].failure_type,
371                 sink.status_srcs[i].addr));
372         }
373         stat->num_with_same_next_hop++;
374       }
375     }
376   }
377 
378   if (stat->num_with_same_next_hop) {
379     bufprintf(stat->topology_info, "have the same next-hop(%d)!\n",
380        stat->next_hop.next_hop);
381   }
382 }
383 
384 /*** Global Functions ***/
385 
/* Returns 1 if the bit S_CODE(code) is clear in error_events (the test
 * was not logged as failed), 0 if it is set. */
int check_passed(int error_events, int code)
{
  return (error_events & S_CODE(code)) ? 0 : 1;
}
390 
391 int call_track_lost_nodes(void* data, int interval, g_event_t* event)
392 {
393   int i;
394   track_lost_nodes();
395 
396   /* Increment the window, and then clear all the counters for this current
397    * window */
398   inc_mod((uint16_t *) &sink.window, 1, TRACK_FAILURE_WINDOW_SIZE);
399         sink.metric_pd++;
400 
401   /* Have to update_counters AFTER incrementing sink.window! */
402   for (i = 0; i < sink.num_srcs; i++) 
403   {
404     update_counters(&sink.status_srcs[i], 1);
405     if (sink.status_srcs[i].failure_type > SFL_OK) {
406       sympathy_emview_text(&sink.status_srcs[i]);
407     }
408   }
409   g_status_dev_notify(sink.metrics_status);
410         return EVENT_RENEW;
411 }
412 
413 /* These are in no specific order */
414 static
415 void step1_run_tests(sympathy_node_info_t* stat)
416 {
417   if (!route_to_sink(stat)) {
418     log_failed(&stat->error_events,S_ROUTE_TO_SINK);
419   }
420 
421   if (!node_has_neighbors(stat)) {
422     log_failed(&stat->error_events,S_NEIGHBORS);
423   }
424   
425   /* Check if node received mostly good packets from other nodes */
426   if (check_errs_rx(stat, &stat->num_pkts_crc_error, &stat->num_pkts_rx, stat->fault_buf)) {
427     log_failed(&stat->error_events, S_NO_COLLISIONS);
428     stat->congestion_detected = 1;
429   }
430 
431   if (!received_data(&stat->metrics_rx, "data", stat->fault_buf)) {
432     log_failed(&stat->error_events, S_RX_DATA_THIS_PD);
433   }
434 
435   if (!received_sufficient_data(&stat->metrics_rx, 
436         &sink.expected_num_sympathy_metrics, stat->fault_buf, 
437         SMSG_RECEPTION_THRESH_DEFAULT)) {
438     elog(LOG_DEBUG(1), "CHECK node %d didnt rx suff data!\n", 
439         stat->addr);
440     log_failed(&stat->error_events, S_RX_SUFFICIENT_DATA);
441   }
442 
443   if (!received_data(&stat->sympathy_stats_rx, "stats", stat->fault_buf)) {
444     log_failed(&stat->error_events, S_RX_STATS);
445   }
446 
447   if (!node_tx_metrics(stat)) log_failed(&stat->error_events, S_COMP_TX_DATA);
448 
449   if (!receiving_data_node_tx(&stat->sympathy_stats_rx, 
450         &stat->num_stats_tx, stat->fault_buf, SMSG_RECEPTION_THRESH_DEFAULT))
451   {
452     log_failed(&stat->error_events,S_RX_STATS_COMP_TX);
453   }
454   if (!received_some_pkts_from_node(stat))
455   {
456     log_failed(&stat->error_events,S_RX_SOME_PKTS_FROM_NODE);
457   }
458   if (!node_heard_from(stat))
459   {
460     log_failed(&stat->error_events,S_NODE_HEARD_FROM);
461   }
462 }
463 
464 static void step1_check_component(sympathy_node_app_info_t* snode)
465 {
466   /* If we haven't received app metrics, then we cant categorize! */
467   if (!received_data(&snode->app_stats_rx_from_node, "stats", snode->fault_buf)) 
468   {
469     log_failed(&snode->error_events, S_RX_STATS);
470   }
471   if (!received_requests(snode, snode->fault_buf)) 
472   {
473     log_failed(&snode->error_events, S_COMP_RX_REQS);
474   }
475   if (!comp_tx_data(snode, snode->fault_buf)) 
476   {
477     log_failed(&snode->error_events, S_COMP_TX_DATA);
478   }
479   if (!received_sufficient_data(&snode->sink_pkt_rx, 
480         &snode->sink_pkt_expected_rx, snode->fault_buf, snode->pkt_reception_percent))
481   {
482     log_failed(&snode->error_events, S_RX_SUFFICIENT_DATA);
483   }
484 
485   if (!received_data(&snode->sink_pkt_rx, "data", snode->fault_buf)) 
486   {
487     log_failed(&snode->error_events, S_RX_DATA_THIS_PD);
488   }
489 }
490 
491 // Check for failures as much as is possible - so we use the receipt
492 // of metrics to determine if we have received sufficient data from
493 // a node. We will then go on to check the components.
494 static
495 int step2_set_failure(int error_events)
496 {
497   if (!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE)) {
498     return SFL_NO_DATA;
499   }
500   else if (!check_passed(error_events, S_RX_SUFFICIENT_DATA)) {
501     return SFL_INSUFFICIENT_DATA;
502   }
503   return SFL_OK;
504 }
505 
506 /* Returns failure for system, and sets root-cause */
507 static
508 int step3_root_cause_system_failure(sympathy_node_info_t* stat)
509 {
510   int failure = SFL_NO_DATA;
511   check_sink_collisions();
512 
513   if (!node_has_neighbors(stat)) {
514     stat->failure_root_cause = SRC_NO_NEIGHBORS;
515   }
516   else if (!node_heard_from(stat)) {
517     stat->failure_root_cause = SRC_NOBODY_CLAIMS_SINK_AS_NEIGHBOR;
518   }
519   else if (!route_to_sink(stat)) {
520     stat->failure_root_cause = SRC_NO_ROUTE;
521   }
522   else failure = SFL_OK;
523   elog(LOG_DEBUG(1), "CHECK system failure = %d, root-cause: %d\n",
524       failure, stat->failure_root_cause);
525   return failure;
526 }
527 
528 static
529 int step3_root_cause_failure(sympathy_node_info_t* stat, int error_events, 
530     int test_events)
531 {
532   if (test_events) check_events(stat);
533 
534   if (stat->rebooted) return SRC_NODE_REBOOTED;
535 
536   else if ((!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE))
537        && (!check_passed(stat->error_events, S_NODE_HEARD_FROM))) {
538     return SRC_NODE_FAILED;
539   }
540 
541   else if (!check_passed(stat->error_events, S_NEIGHBORS)) {
542     return SRC_NO_NEIGHBORS;
543   }
544 
545   else if (!check_passed(stat->error_events, S_ROUTE_TO_SINK)) {
546     return SRC_NO_ROUTE;
547   }
548 
549   /* If the sink has'nt received statistics from this component,
550    * then sympathy can't do anything, and just assumes that the
551    * sink is not receiving data sent by the node. */
552   else if (!check_passed(error_events, S_RX_STATS)) {
553     return SRC_BAD_PATH_TO_SINK;
554   }
555 
556   /* maybe its beacuse the component is not receiving the
557    * requests */
558   else if (!check_passed(error_events, S_COMP_RX_REQS)) {
559     return SRC_BAD_PATH_TO_NODE;
560   }
561   /* OW because the node is not transmitting data in response */
562   else if (!check_passed(error_events, S_COMP_TX_DATA)) {
563    return SRC_BAD_NODE_TRANSMIT;
564   }
565 
566   return SRC_BAD_PATH_TO_SINK;
567 }
568 
569 /* This func is only called if the node has a failure to begin with */
570 static
571 void step4_localize_failure(sympathy_node_info_t* stat)
572 {
573   sympathy_node_info_t* curr_stat = NULL;
574   sympathy_node_info_t* sink_stat = find_status_ptr(my_node_id);
575   int iter = 0;
576 
577   stat->failure_localization = S_SELF;
578   find_nodes_with_same_next_hop(stat);
579 
580   /* If the sink has a failure, then all failures are localized
581    * to the sink - other than the sink's failure, which is localized to
582    * itself. */
583   if (stat->addr == my_node_id) {
584     stat->failure_localization = S_SELF;
585     return;
586   }
587 
588   /* If the node rebooted, nothing can explain that along the path */
589   else if (stat->rebooted) {
590     stat->failure_localization = S_SELF;
591     return;
592   }
593 
594   /* If the sink has a failure, or the node's failure is just a communication
595    * issue and the sink is experiencing congestion, then localize it to the
596    * sink */
597   else if ((sink_stat->failure_type > SFL_OK)
598             || ((stat->failure_root_cause < SRC_NO_NEIGHBORS)
599                 && (sink_stat->congestion_detected))) {
600      stat->failure_localization = S_SINK;
601      stat->source_node_failure = my_node_id;
602      return;
603    }
604 
605   /* Otherwise we try to find the source of the failure somewhere in the
606    * network. Even if the route is not valid, we still use it as an indicator
607    * for fault localization. This is a reasonable thing to do because the
608    * route is probably invalid as a result of the fault. */
609   //else if (route_valid(stat)) {
610   else {
611     curr_stat = stat;
612     while ((curr_stat = iter_next_hop(curr_stat, &iter, 
613              curr_stat->next_hop.sink))) {
614 
615       /* If this node along the path has no root-caused failure
616        * then it cannot be the source of the node's failure */
617       if (!(curr_stat->congestion_detected 
618             || (curr_stat->failure_type > SFL_OK))) continue;
619 
620       /* If we find a worse root-cause on a node closer to the sink 
621        * (worse defined by ordering of root-causes, then that node
622        * is the source of the current node's problems.
623        * There are some exceptions to this ordering.
624        * If the current node's failure is less critical than
625        * no-neighbors, then it can be explained by any failure
626        * downstream from it - NOT just one that is "worse" */
627       if ((curr_stat->failure_root_cause >= stat->failure_root_cause) 
628              || (stat->failure_root_cause < SRC_NO_NEIGHBORS)) {
629         stat->failure_localization = S_PATH;
630         stat->source_node_failure = curr_stat->addr;
631       }
632 
633       /* If the next hop isn't valid, we can't guage the rest of 
634        * the route, so we keep the current status */
635       if (!route_valid(curr_stat)) break;
636     }
637   }
638 
639   /* If failure localized to self, and there is congestion at this node,
640    * then we will localize it to the path, but specify the node as itself */
641   if (stat->failure_localization == S_SELF) {
642     if (stat->congestion_detected) {
643         stat->source_node_failure = stat->addr;
644         stat->failure_localization = S_PATH;
645     }
646   }
647 
648   return;
649 }
650 
651 void track_lost_nodes()
652 {
653   int i, tmp_root_cause, j;
654   sympathy_node_app_info_t* snode;
655   sympathy_node_info_t* stat;
656   int notify = 0;
657 
658   elog(LOG_DEBUG(1), "window: %d, metric-pd: %d\n", sink.window,
659       sink.metric_pd);
660 
661   /* Only include nodes whom we have heard from in the past
662    * epoch - in analysis of other nodes  */
663   for (i = 0; i < sink.num_srcs; i++)
664   {
665     stat = &sink.status_srcs[i];
666 
667     /* Calculate values for last-epoch */
668     update_counters(stat, 0);
669 
670     /* Clear previous values, which are stored until next calculation */
671     stat->error_events = 0;
672     clear_buf(&stat->fault_buf);
673     clear_buf(&stat->topology_info);
674 
675     if (!received_data(&stat->metrics_rx, "data", NULL))  {
676       stat->metrics_valid = 0;
677     }
678     else stat->metrics_valid = 1;
679     stat->congestion_detected = 0;
680   }
681 
682   /* Don't begin checking for failures until we have had 
683    * TRACK_FAILURE_WINDOW_SIZE metrics periods */
684   if (sink.metric_pd < TRACK_FAILURE_WINDOW_SIZE) return;
685 
686   /* Set the failure category for all nodes */
687   for (i = 0; i < sink.num_srcs; i++)
688   {
689     stat = &sink.status_srcs[i];
690     tmp_root_cause = stat->failure_root_cause;
691 
692     /* If the node is a sink, then look for system wide failures because the
693      * sink itself won't have any failures we want to detect */
694     if (stat->addr == my_node_id)
695     {
696       stat->failure_type = step3_root_cause_system_failure(stat);
697     }
698 
699     /* Otherwise check failures on the node, sympathy and remaining components */
700     else
701     {
702       step1_run_tests(stat);
703 
704       // NR make sympathy one of the components and check it as such?
705       stat->failure_type = step2_set_failure(stat->error_events);
706 
707       /* Then check metrics for each application registered with sympathy */
708       for (j = 0; j < sink.num_apps_registered; j++) {
709         snode = &stat->node_app_info[j];
710         snode->error_events = 0;
711         clear_buf(&snode->fault_buf);
712         
713         /* Run tests on component */
714         step1_check_component(snode);
715         snode->failure_type = step2_set_failure(snode->error_events);
716 
717         /* If we have a failure from this node, then try to root-cause it */
718         if (snode->failure_type > SFL_OK) {
719           step3_root_cause_failure(stat, snode->error_events, 0);
720         }
721 
722         /* If the current failure assignment for the node is an OK, and a component
723          * has a failure, then we set the current failure assignment
724          * to Insufficient data */
725         if ((stat->failure_type == SFL_OK) && (snode->failure_type > SFL_OK)) {
726           stat->failure_type = SFL_INSUFFICIENT_DATA;
727           stat->failure_root_cause = snode->failure_root_cause;
728         }
729       }
730 
731       /* If we have a failure from this node, then try to root-cause it */
732       if (stat->failure_type > SFL_OK) {
733         stat->failure_root_cause = 
734           step3_root_cause_failure(stat, stat->error_events, 1);
735         elog(LOG_DEBUG(1), "CHECK node %d had failure: %d, root-cause: %d\n", 
736             stat->addr, stat->failure_type, stat->failure_root_cause);
737       }
738     }
739 
740     /* If this root-cause is new, we note it and notify devices */
741     if (stat->failure_type > SFL_OK) {
742       if (tmp_root_cause != stat->failure_root_cause) {
743         stat->period_root_caused = sink.metric_pd;
744         notify = 1;
745       }
746     }
747   }
748 
749   if (notify) {
750     g_status_dev_notify(sink.summary_status);
751     g_status_dev_notify(sink.fail_status);
752   }
753 
754 #ifdef USE_BAYES
755   /* create bayes network based on routes */
756   bayes_classify_network(my_node_id);
757 #endif
758 
759   /* Clear the agg_prev_epoch values, and update counters */
760   stats_ctr_update(&sink.expected_num_sympathy_metrics, 0, 0);
761   stats_ctr_update(&sink.expected_num_sympathy_metrics, 1, 0);
762 
763   /* Diagnose the failure - either caused by congestion along
764    * the path, or what? */
765   for (j = 0; j < sink.num_srcs; j++) {
766     stat = &sink.status_srcs[j];
767     if (stat->failure_type > SFL_OK) step4_localize_failure(stat);
768   }
769 }
770 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.