(file) Return to sympathy_analyze.c CVS log (file) Jump to this file's LXR Page (dir) Up to [CENS] / emstar / devel / sympathy_devel

File: [CENS] / emstar / devel / sympathy_devel / sympathy_analyze.c (download) / (as text)
Revision: 1.64, Thu Jul 20 19:30:32 2006 UTC (3 years, 4 months ago) by nithya
Branch: MAIN
CVS Tags: pregeonet, PRE_TOSNIC_FIX, PRE_64BIT, HEAD, CYCLOPS_RELEASE_CANDIDATE_2_0, CYCLOPS_PRERELEASE_STABLE, CENTROUTE_EMSTAR_SOCKETS, AMARSS_JR_DEPLOYMENT_6_05_07
Changes since 1.63: +5 -19 lines
Bug fixes

/* ex: set tabstop=2 expandtab shiftwidth=2 softtabstop=2: */
/*
 *
 * Copyright (c) 2003 The Regents of the University of California.  All 
 * rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Neither the name of the University nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

 /*
  *
  * Author: Nithya Ramanthan
  *
  */

/* NR todo: fix #root-detections. They can sometimes use
 * outdated routes - don't specify that a node is a root
 * just because its route is outdated! */

#include <sympathy.h>
#include <sim/radio.h>

/* Forward declarations for helpers that are defined further down in
 * this file. */
static void check_sink_collisions();

/* check_errs_rx() is called from check_sink_collisions(), which appears
 * before its definition. */
static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t* 
    good_rx, buf_t* fault_buf);

/* Event sanity check. Currently the only check is that the next-hop the
 * node selected appears in the node's own neighbor list; a complaint is
 * appended to stat->fault_buf when find_neighbor() returns a negative
 * value. A next-hop of 0 is skipped (presumably "no next-hop selected" --
 * TODO confirm). */
static
void check_events(sympathy_node_info_t* stat)
{
  if ((stat->next_hop.next_hop != 0)
      && (find_neighbor(stat, (Saddr_t) stat->next_hop.next_hop) < 0)) {
    bufprintf(stat->fault_buf, "ERROR: Next-hop: %d is not a neighbor!\n", stat->next_hop.next_hop);
  }
}

/* Record that the test identified by `code` failed by OR-ing
 * S_CODE(code) into the caller's error-event word. */
static
void log_failed(int* error_events, int code)
{
  *error_events = *error_events | (S_CODE(code));
}

/* Returns 1 when the node reports at least one neighbor, 0 otherwise. */
static 
int node_has_neighbors(sympathy_node_info_t* stat)
{
  if (stat->num_neighbors > 0) return 1;
  return 0;
}

/* Returns 1 when a valid route to this sink (my_node_id) exists.
 *
 * For an ordinary node: its own route must pass route_valid() and name
 * my_node_id as the sink.
 * For the sink itself: it is enough that any tracked source has such a
 * route. */
static 
int route_to_sink(sympathy_node_info_t* stat)
{
  uint8_t idx;

  /* Ordinary node: only its own route matters */
  if (stat->addr != my_node_id) {
    if (!route_valid(stat)) return 0;
    return (stat->next_hop.sink == my_node_id);
  }

  /* Sink: look for anybody whose valid route ends at us */
  for (idx = 0; idx < sink.num_srcs; idx++) {
    if (!route_valid(&sink.status_srcs[idx])) continue;
    if (sink.status_srcs[idx].next_hop.sink == my_node_id) return 1;
  }

  return 0;
}

/* Counts how many other tracked nodes list this node as a neighbor while
 * their neighbor information passes neighbors_valid(). The count is stored
 * in stat->num_heard_this_node, a summary line is appended to
 * stat->topology_info, and the function returns 1 when at least one such
 * node was found, 0 otherwise. */
static
int node_heard_from(sympathy_node_info_t* stat)
{
  uint8_t src, nbr;
  sympathy_node_info_t* other;
  buf_t* heard_by = buf_new();

  stat->num_heard_this_node = 0;

  for (src = 0; src < sink.num_srcs; src++)
  {
    other = &sink.status_srcs[src];

    /* The node's own neighbor list does not count */
    if (other->addr == stat->addr) continue;

    for (nbr = 0; nbr < other->num_neighbors; nbr++)
    {
      if (other->neighbors[nbr].node_id != stat->addr) continue;
      if (!neighbors_valid(other)) continue;

      stat->num_heard_this_node++;
      bufprintf(heard_by, "%d,", other->addr); 
    }
  }

#ifdef USE_BAYES
  /* Keep the high-water mark of the count */
  if (stat->num_heard_this_node > stat->max_num_heard_this_node)
    stat->max_num_heard_this_node = stat->num_heard_this_node;
#endif

  if (heard_by->len > 0) {
    bufprintf(stat->topology_info, "Num neighbors heard this node: %d {%s}\n", 
        stat->num_heard_this_node, heard_by->buf);
  }
  else {
    bufprintf(stat->topology_info, "NO NEIGHBORS heard this node!\n");
  }

  buf_free(heard_by);
  return (stat->num_heard_this_node > 0);
}


/* Runs the CRC-error check on the sink's own status record (looked up via
 * find_status_ptr(my_node_id)). When the check fires, S_NO_COLLISIONS is
 * logged as failed and congestion_detected is set on the sink's record.
 * NOTE(review): the lookup result is dereferenced without a NULL check --
 * confirm find_status_ptr() cannot fail for my_node_id. */
static 
void check_sink_collisions()
{
  sympathy_node_info_t *sink_stat = find_status_ptr(my_node_id);
  int congested = check_errs_rx(sink_stat, &sink_stat->num_pkts_crc_error,
      &sink_stat->num_pkts_rx, sink_stat->fault_buf);

  if (!congested) return;

  log_failed(&sink_stat->error_events, S_NO_COLLISIONS);
  sink_stat->congestion_detected = 1;
}

/* Calls stats_ctr_update() on every per-node counter and on the counters
 * of every registered application, forwarding `clear` unchanged. The third
 * argument is 1 only for the node-side transmit counters (node_num_pkts_tx,
 * num_metrics_tx, num_stats_tx) and 0 everywhere else; its meaning is
 * defined by stats_ctr_update(), which is not visible here.
 * When `clear` is non-zero the node's `rebooted` flag is reset as well. */
static void update_counters(sympathy_node_info_t* stat, uint8_t clear)
{
  int k;
  sympathy_node_app_info_t* app;

  /* Sympathy traffic seen at the sink */
  stats_ctr_update(&stat->metrics_rx, clear, 0);
  stats_ctr_update(&stat->sympathy_stats_rx, clear, 0);

  /* Per packet-type counters */
  for (k = 0; k < NUM_TOS_PKT_TYPES; k++) {
    stats_ctr_update(&stat->tos_packets[k], clear, 0);
  }
  stats_ctr_update(&stat->errs_rx, clear, 0);

  /* Per-application counters */
  for (k = 0; k < sink.num_apps_registered; k++) {
    app = &stat->node_app_info[k];
    stats_ctr_update(&app->node_num_pkts_rx, clear, 0);
    stats_ctr_update(&app->node_send_failures, clear, 0); 
    stats_ctr_update(&app->node_max_queue_occupancy, clear, 0);
    stats_ctr_update(&app->node_num_pkts_dropped, clear, 0);
    stats_ctr_update(&app->node_num_pkts_tx, clear, 1);
    stats_ctr_update(&app->app_stats_rx_from_node, clear, 0);
    stats_ctr_update(&app->sink_pkt_tx, clear, 0);
    stats_ctr_update(&app->sink_pkt_rx, clear, 0);
    stats_ctr_update(&app->sink_pkt_expected_rx, clear, 0);
  }

  /* Remaining node-level counters */
  stats_ctr_update(&stat->time_awake_mins, clear, 0);
  stats_ctr_update(&stat->num_metrics_tx, clear, 1);
  stats_ctr_update(&stat->num_stats_tx, clear, 1);
  stats_ctr_update(&stat->num_pkts_tx_succeeded, clear, 0);
  stats_ctr_update(&stat->num_pkts_rx, clear, 0);
  stats_ctr_update(&stat->num_pkts_dropped, clear, 0);
  stats_ctr_update(&stat->num_pkts_tx_failed, clear, 0);
  stats_ctr_update(&stat->num_pkts_crc_error, clear, 0);

  if (clear) {
    stat->rebooted = 0;
  }
}

/* Replace *buf with a brand-new buffer: the old buffer is released with
 * buf_free() and the pointer is overwritten with the result of buf_new(). */
void clear_buf(buf_t** buf)
{
  buf_t* fresh;

  buf_free(*buf);
  fresh = buf_new();
  *buf = fresh;
}


/**** Running Tests ****/

/*
static void
compare_link_quality(sympathy_node_info_t* stat)
{
  float quality;
  int i;

  for (i = 0; i < stat->num_neighbors; i++) {
    if (stat->neighbor_info[i].sim_link_quality < 0) {
      elog(LOG_ERR, "ERROR Couldnt get simulation link quality for nodes %d -> %d\n", 
        stat->addr, stat->neighbors[i].node_id);
      continue;
    }
    quality = 100 * (stat->neighbors[i].quality/255);
    elog(LOG_DEBUG(1), "sim-link: %f, reported: %d, diff: %f\n", 
      stat->neighbor_info[i].sim_link_quality, 
      quality, 
      abs_float(stat->neighbor_info[i].sim_link_quality - 
        quality));
  }
}
*/

/* application-specific tests */
/* Decide whether too few messages were counted.
 *
 *   msgs_have             - messages actually counted
 *   msgs_expected         - messages that should have been counted
 *   msg_reception_percent - minimum acceptable percentage of msgs_expected
 *
 * Returns 1 when msgs_have is below msg_reception_percent percent of
 * msgs_expected, or when nothing was counted although something was
 * expected (the integer threshold can round down to 0, which the first
 * comparison alone would accept). Returns 0 otherwise. */
static int
check_insufficient_msgs(int msgs_have, int msgs_expected, int msg_reception_percent)
{
  /* Multiply in 64 bits: the product of two ints can overflow int, which
   * is undefined behaviour and would yield a bogus threshold. */
  long long threshold =
    ((long long) msgs_expected * msg_reception_percent) / 100;

  if (msgs_have < threshold) return 1;

  return ((msgs_have == 0) && (msgs_expected > 0));
}

/* Compares the requests the node's component received in the previous
 * epoch against the requests the sink transmitted, using the component's
 * pkt_reception_percent as the threshold. Returns 1 when enough were
 * received; otherwise appends a diagnostic to fault_buf and returns 0. */
static
int received_requests(sympathy_node_app_info_t* snode, buf_t* fault_buf)
{
  int too_few = check_insufficient_msgs(snode->node_num_pkts_rx.agg_prev_epoch, 
      snode->sink_pkt_tx.agg_prev_epoch, snode->pkt_reception_percent);

  if (!too_few) return 1;

  bufprintf(fault_buf,"\t0x%x: Num reqs node rx/Num reqs sink tx: %d/%d\n", 
      S_CODE(S_COMP_RX_REQS), snode->node_num_pkts_rx.agg_prev_epoch, 
      snode->sink_pkt_tx.agg_prev_epoch);
  return 0;
}

/* Compares the packets the node's component transmitted in the previous
 * epoch against the number the sink expected to receive, using the
 * component's pkt_reception_percent as the threshold. Returns 1 when the
 * component sent enough; otherwise appends a diagnostic to fault_buf and
 * returns 0. */
static
int comp_tx_data(sympathy_node_app_info_t* snode, buf_t* fault_buf)
{
  int too_few = check_insufficient_msgs(snode->node_num_pkts_tx.agg_prev_epoch,
      snode->sink_pkt_expected_rx.agg_prev_epoch, snode->pkt_reception_percent);

  if (!too_few) return 1;

  bufprintf(fault_buf, 
    "\tComp tx/ sink expected: %d/%d\n", 
     snode->node_num_pkts_tx.agg_prev_epoch, 
     snode->sink_pkt_expected_rx.agg_prev_epoch);
  return 0;
}

/* Returns 1 when the node's num_metrics_tx counter for the previous epoch
 * is greater than zero, 0 otherwise. */
static
int node_tx_metrics(sympathy_node_info_t* stat)
{
  if (stat->num_metrics_tx.agg_prev_epoch > 0) return 1;
  return 0;
}

/* Returns 1 when the counter's previous-epoch aggregate is non-zero.
 * Otherwise writes a line labelled `type` (current count and previous-epoch
 * aggregate) to fault_buf and returns 0.
 *
 * A time-based variant built on
 * get_minutes_since_event(&pkts_rx->last_updated) >= (EPOCH_MSEC/60000)
 * was tried by the original author and left disabled ("why dont this
 * work??").
 *
 * NOTE(review): track_lost_nodes() in this file passes NULL as fault_buf,
 * so bufprintf() presumably tolerates a NULL buffer -- TODO confirm. */
int received_data(stats_ctr_t* pkts_rx, char* type, buf_t* fault_buf)
{
  if (pkts_rx->agg_prev_epoch != 0) return 1;

  bufprintf(fault_buf, "\t%s: Num pkts rx: %d(%d)\n", 
      type, pkts_rx->ctr, pkts_rx->agg_prev_epoch);
  return 0;
}

/* Compares what the sink received in the previous epoch against what the
 * node says it transmitted, with pkt_reception_percent as the threshold.
 * Returns 1 when the sink received enough; otherwise appends a diagnostic
 * to fault_buf and returns 0. */
static
int receiving_data_node_tx(stats_ctr_t* pkts_sink_rx, 
    stats_ctr_t* pkts_node_tx, buf_t* fault_buf, int pkt_reception_percent)
{
  int too_few = check_insufficient_msgs(pkts_sink_rx->agg_prev_epoch, 
      pkts_node_tx->agg_prev_epoch, pkt_reception_percent);

  if (!too_few) return 1;

  bufprintf(fault_buf, 
    "\tsink rx pkts/Node tx pkts: %d/%d\n", 
    pkts_sink_rx->agg_prev_epoch, pkts_node_tx->agg_prev_epoch);
  return 0;
}

/* Flags an excessive number of bad receptions in the previous epoch.
 * Returns 1 (after appending a diagnostic to fault_buf) when at least one
 * error was counted and the error count reaches
 * PERCENT_GOOD_PACKETS_CONGESTION percent of the good count; returns 0
 * otherwise. The `stat` parameter is not used by this function. */
static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t* 
    good_rx, buf_t* fault_buf)
{
  if (errors_rx->agg_prev_epoch > 0) 
  {
    if (errors_rx->agg_prev_epoch 
        >= (good_rx->agg_prev_epoch*PERCENT_GOOD_PACKETS_CONGESTION)/100)
    {
      bufprintf(fault_buf, "\tRx bad/good %d/%d\n",
        errors_rx->agg_prev_epoch, good_rx->agg_prev_epoch);
      return 1;
    }
  }
  return 0;
}

/* Compares the packets received in the previous epoch against the number
 * expected, with pkt_reception_percent as the threshold. Returns 1 when
 * enough arrived; otherwise appends a diagnostic to fault_buf, logs the
 * buffer contents at debug level, and returns 0. */
static
int received_sufficient_data(stats_ctr_t* pkts_rx, 
    stats_ctr_t* expected_pkts_rx, buf_t* fault_buf, int pkt_reception_percent)
{
  int too_few = check_insufficient_msgs(pkts_rx->agg_prev_epoch, 
      expected_pkts_rx->agg_prev_epoch, pkt_reception_percent);

  if (!too_few) return 1;

  bufprintf(fault_buf, "\tSink rx/Expected to rx: %d/%d\n",
        pkts_rx->agg_prev_epoch, expected_pkts_rx->agg_prev_epoch);
  elog(LOG_DEBUG(1), "CHECK fault-buf: %s\n", fault_buf->buf);
  return 0;
}

/* This test is about the time elapsed since the sink last received any
 * packet from the node, so it looks at the timestamp
 * stat->packet.last_updated and returns whatever event_valid() reports
 * for it.
 *
 * A disabled alternative compared
 * get_minutes_since_event(&stat->packet.last_updated) against
 * (EPOCH_MSEC/60000) and returned 0 once that limit was reached. */
static int received_some_pkts_from_node(sympathy_node_info_t* stat) 
{
  int recently_heard = event_valid(&stat->packet.last_updated);

  return recently_heard;
}

#ifdef USE_BAYES
/* Returns non-zero when the SNON_ROUTING_PKT entry of stat->tos_packets is
 * greater than zero. Only compiled when USE_BAYES is defined.
 *
 * NOTE(review): update_counters() in this file passes
 * &stat->tos_packets[j] to stats_ctr_update() exactly like the stats_ctr_t
 * counters, which suggests tos_packets[] holds stats_ctr_t structures. If
 * so, comparing an element against 0 would not compile and this probably
 * needs to test one of the counter's fields (ctr or agg_prev_epoch) --
 * TODO confirm which one is intended. */
int received_non_symp_app_pkts_from_node(sympathy_node_info_t* stat)
{
  return (stat->tos_packets[SNON_ROUTING_PKT] > 0);
}
#endif

/**** General Testing Framework ***/
/* Collects the other tracked nodes that share this node's next-hop and
 * sink and whose route passes route_valid(). Their addresses are stored in
 * stat->nodes_with_same_next_hop[], the count in
 * stat->num_with_same_next_hop, and a readable list (with each node's root
 * cause when it has a failure) is appended to stat->topology_info.
 * Nothing matches when this node's next-hop is not greater than 0.
 * NOTE(review): the array index is not bounds-checked here -- confirm that
 * nodes_with_same_next_hop[] can hold every tracked node. */
static
void find_nodes_with_same_next_hop(sympathy_node_info_t* stat)
{
  int idx;
  sympathy_node_info_t* other;

  stat->num_with_same_next_hop = 0;

  for (idx = 0; idx < sink.num_srcs; idx++)
  {
    other = &sink.status_srcs[idx];

    /* Skip the node itself and anything that does not share a valid route */
    if (other->addr == stat->addr) continue;
    if (!(stat->next_hop.next_hop > 0)) continue;
    if (other->next_hop.next_hop != stat->next_hop.next_hop) continue;
    if (other->next_hop.sink != stat->next_hop.sink) continue;
    if (!route_valid(other)) continue;

    bufprintf(stat->topology_info, ", %d ", other->addr);
    stat->nodes_with_same_next_hop[stat->num_with_same_next_hop] = 
      other->addr;

    if (other->failure_type > SFL_OK) {
      bufprintf(stat->topology_info, "(root-cause=%s)", 
          decode_root_cause(other->failure_type, other->addr));
    }
    stat->num_with_same_next_hop++;
  }

  if (stat->num_with_same_next_hop) {
    bufprintf(stat->topology_info, "have the same next-hop(%d)!\n",
       stat->next_hop.next_hop);
  }
}

/*** Global Functions ***/

/* Returns 1 when the bit S_CODE(code) is clear in error_events (the test
 * did not fail), 0 when it is set. */
int check_passed(int error_events, int code) 
{
  if ((error_events & S_CODE(code)) == 0) return 1;
  return 0;
}

/* Event callback that runs one analysis pass and then rolls the window
 * forward. The data, interval and event arguments are not used.
 *
 * Steps: run track_lost_nodes(); advance sink.window modulo
 * TRACK_FAILURE_WINDOW_SIZE and bump sink.metric_pd; clear every node's
 * counters (this must happen AFTER sink.window has been incremented);
 * refresh the emview text of nodes that currently have a failure; notify
 * the metrics status device. Always returns EVENT_RENEW. */
int call_track_lost_nodes(void* data, int interval, g_event_t* event)
{
  int idx;
  sympathy_node_info_t* node;

  track_lost_nodes();

  inc_mod((uint16_t *) &sink.window, 1, TRACK_FAILURE_WINDOW_SIZE);
  sink.metric_pd++;

  for (idx = 0; idx < sink.num_srcs; idx++) 
  {
    node = &sink.status_srcs[idx];

    update_counters(node, 1);
    if (node->failure_type > SFL_OK) {
      sympathy_emview_text(node);
    }
  }

  g_status_dev_notify(sink.metrics_status);
  return EVENT_RENEW;
}

/* Step 1 for a node: run every node-level test and set the matching bit
 * in stat->error_events for each one that fails. The tests are independent
 * of one another (no particular order is intended), but several of them
 * append text to stat->fault_buf / stat->topology_info, so their sequence
 * determines the order of that text. */
static
void step1_run_tests(sympathy_node_info_t* stat)
{
  int* errs = &stat->error_events;

  /* Routing and neighborhood */
  if (!route_to_sink(stat))
    log_failed(errs, S_ROUTE_TO_SINK);

  if (!node_has_neighbors(stat))
    log_failed(errs, S_NEIGHBORS);

  /* Did the node receive mostly good packets from other nodes? */
  if (check_errs_rx(stat, &stat->num_pkts_crc_error, &stat->num_pkts_rx, stat->fault_buf))
  {
    log_failed(errs, S_NO_COLLISIONS);
    stat->congestion_detected = 1;
  }

  /* Metrics received at the sink */
  if (!received_data(&stat->metrics_rx, "data", stat->fault_buf))
    log_failed(errs, S_RX_DATA_THIS_PD);

  if (!received_sufficient_data(&stat->metrics_rx, 
        &sink.expected_num_sympathy_metrics, stat->fault_buf, 
        SMSG_RECEPTION_THRESH_DEFAULT))
  {
    elog(LOG_DEBUG(1), "CHECK node %d didnt rx suff data!\n", 
        stat->addr);
    log_failed(errs, S_RX_SUFFICIENT_DATA);
  }

  /* Statistics received at the sink */
  if (!received_data(&stat->sympathy_stats_rx, "stats", stat->fault_buf))
    log_failed(errs, S_RX_STATS);

  if (!node_tx_metrics(stat))
    log_failed(errs, S_COMP_TX_DATA);

  if (!receiving_data_node_tx(&stat->sympathy_stats_rx, 
        &stat->num_stats_tx, stat->fault_buf, SMSG_RECEPTION_THRESH_DEFAULT))
    log_failed(errs, S_RX_STATS_COMP_TX);

  /* Liveness as seen by the sink and by the node's neighbors */
  if (!received_some_pkts_from_node(stat))
    log_failed(errs, S_RX_SOME_PKTS_FROM_NODE);

  if (!node_heard_from(stat))
    log_failed(errs, S_NODE_HEARD_FROM);
}

/* Step 1 for one application component on a node: run the component-level
 * tests and set the matching bit in snode->error_events for each failure.
 * Diagnostics go to snode->fault_buf. */
static void step1_check_component(sympathy_node_app_info_t* snode)
{
  int* errs = &snode->error_events;

  /* Without application metrics from the node we cannot categorize */
  if (!received_data(&snode->app_stats_rx_from_node, "stats", snode->fault_buf)) 
    log_failed(errs, S_RX_STATS);

  /* Requests reaching the component */
  if (!received_requests(snode, snode->fault_buf)) 
    log_failed(errs, S_COMP_RX_REQS);

  /* Responses sent by the component */
  if (!comp_tx_data(snode, snode->fault_buf)) 
    log_failed(errs, S_COMP_TX_DATA);

  /* Responses arriving at the sink */
  if (!received_sufficient_data(&snode->sink_pkt_rx, 
        &snode->sink_pkt_expected_rx, snode->fault_buf, snode->pkt_reception_percent))
    log_failed(errs, S_RX_SUFFICIENT_DATA);

  if (!received_data(&snode->sink_pkt_rx, "data", snode->fault_buf)) 
    log_failed(errs, S_RX_DATA_THIS_PD);
}

// Step 2: turn the recorded test failures into a failure category.
// Failures are checked as far as is possible, so receipt of metrics is
// what decides whether sufficient data came from a node; the components
// are checked afterwards by the caller.
//   - no packets at all from the node  -> SFL_NO_DATA
//   - some packets, but not enough     -> SFL_INSUFFICIENT_DATA
//   - otherwise                        -> SFL_OK
static
int step2_set_failure(int error_events)
{
  if (!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE))
    return SFL_NO_DATA;

  if (!check_passed(error_events, S_RX_SUFFICIENT_DATA))
    return SFL_INSUFFICIENT_DATA;

  return SFL_OK;
}

/* System-wide check, run on the sink's own record. First runs the sink
 * collision check, then looks for (in this order) no neighbors, nobody
 * listing the sink as a neighbor, and no route to the sink. The first
 * problem found is stored in stat->failure_root_cause and SFL_NO_DATA is
 * returned; when none applies the root cause is left untouched and SFL_OK
 * is returned. */
static
int step3_root_cause_system_failure(sympathy_node_info_t* stat)
{
  int failure = SFL_OK;

  check_sink_collisions();

  if (!node_has_neighbors(stat)) {
    stat->failure_root_cause = SRC_NO_NEIGHBORS;
    failure = SFL_NO_DATA;
  }
  else if (!node_heard_from(stat)) {
    stat->failure_root_cause = SRC_NOBODY_CLAIMS_SINK_AS_NEIGHBOR;
    failure = SFL_NO_DATA;
  }
  else if (!route_to_sink(stat)) {
    stat->failure_root_cause = SRC_NO_ROUTE;
    failure = SFL_NO_DATA;
  }

  elog(LOG_DEBUG(1), "CHECK system failure = %d, root-cause: %d\n",
      failure, stat->failure_root_cause);
  return failure;
}

/* Step 3: pick the root cause for a failure that has already been
 * detected. `error_events` holds the test results being diagnosed (the
 * node's or a component's); the neighbor, route and heard-from checks
 * always use the node's own stat->error_events. When `test_events` is
 * non-zero, check_events() is run first. The first matching rule wins. */
static
int step3_root_cause_failure(sympathy_node_info_t* stat, int error_events, 
    int test_events)
{
  if (test_events) {
    check_events(stat);
  }

  if (stat->rebooted) {
    return SRC_NODE_REBOOTED;
  }

  /* The sink hears nothing from it and no neighbor lists it */
  if ((!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE))
       && (!check_passed(stat->error_events, S_NODE_HEARD_FROM))) {
    return SRC_NODE_FAILED;
  }

  if (!check_passed(stat->error_events, S_NEIGHBORS)) {
    return SRC_NO_NEIGHBORS;
  }

  if (!check_passed(stat->error_events, S_ROUTE_TO_SINK)) {
    return SRC_NO_ROUTE;
  }

  /* If the sink hasn't received statistics from this component, then
   * sympathy can't do anything more and just assumes that the sink is not
   * receiving data sent by the node. */
  if (!check_passed(error_events, S_RX_STATS)) {
    return SRC_BAD_PATH_TO_SINK;
  }

  /* Maybe the component is not receiving the requests */
  if (!check_passed(error_events, S_COMP_RX_REQS)) {
    return SRC_BAD_PATH_TO_NODE;
  }

  /* Otherwise, maybe the node is not transmitting data in response */
  if (!check_passed(error_events, S_COMP_TX_DATA)) {
    return SRC_BAD_NODE_TRANSMIT;
  }

  return SRC_BAD_PATH_TO_SINK;
}

/* Step 4: decide where a node's failure originates. Only called for nodes
 * that already have a failure. Sets stat->failure_localization to S_SELF,
 * S_SINK or S_PATH and, for the latter two, stat->source_node_failure to
 * the node held responsible. Also refreshes the list of nodes sharing this
 * node's next-hop. */
static
void step4_localize_failure(sympathy_node_info_t* stat)
{
  sympathy_node_info_t* hop = NULL;
  sympathy_node_info_t* sink_stat = find_status_ptr(my_node_id);
  int hop_iter = 0;

  stat->failure_localization = S_SELF;
  find_nodes_with_same_next_hop(stat);

  /* The sink's own failure is always localized to itself */
  if (stat->addr == my_node_id) {
    stat->failure_localization = S_SELF;
    return;
  }

  /* Nothing along the path can explain a reboot */
  if (stat->rebooted) {
    stat->failure_localization = S_SELF;
    return;
  }

  /* If the sink has a failure, or the node's failure is just a
   * communication issue (root cause below SRC_NO_NEIGHBORS) while the sink
   * is experiencing congestion, blame the sink */
  if ((sink_stat->failure_type > SFL_OK)
      || ((stat->failure_root_cause < SRC_NO_NEIGHBORS)
          && (sink_stat->congestion_detected))) {
    stat->failure_localization = S_SINK;
    stat->source_node_failure = my_node_id;
    return;
  }

  /* Otherwise walk the route towards the sink looking for the source of
   * the failure. The route is used even when it is not valid: it is
   * probably invalid as a result of the fault, so it is still a reasonable
   * indicator for localization. */
  hop = stat;
  while ((hop = iter_next_hop(hop, &hop_iter, 
           hop->next_hop.sink))) {

    /* A hop with neither congestion nor a failure of its own cannot be
     * the source of this node's failure */
    if (!hop->congestion_detected 
        && !(hop->failure_type > SFL_OK)) continue;

    /* A hop closer to the sink with an equal or worse root cause (by the
     * ordering of root-cause values) is the source of this node's problem.
     * Exception: a failure less critical than no-neighbors can be
     * explained by ANY failure further along the path, not only a worse
     * one. */
    if ((hop->failure_root_cause >= stat->failure_root_cause) 
        || (stat->failure_root_cause < SRC_NO_NEIGHBORS)) {
      stat->failure_localization = S_PATH;
      stat->source_node_failure = hop->addr;
    }

    /* If this hop's route isn't valid we can't gauge the rest of the
     * path, so keep the current verdict */
    if (!route_valid(hop)) break;
  }

  /* Still localized to the node itself but the node sees congestion:
   * report it as a path problem with the node itself as the source */
  if ((stat->failure_localization == S_SELF) && stat->congestion_detected) {
    stat->source_node_failure = stat->addr;
    stat->failure_localization = S_PATH;
  }

  return;
}

/* One full analysis pass over every node tracked by the sink.
 *
 *  1. For each node: compute the last-epoch counter values, reset the
 *     per-pass state (error bits, fault and topology buffers, congestion
 *     flag) and record whether metrics were received (metrics_valid).
 *  2. Stop here until TRACK_FAILURE_WINDOW_SIZE metric periods have passed.
 *  3. For each node: the sink gets the system-wide check; every other node
 *     gets the node tests, then the tests of each registered application,
 *     then a failure category and -- if it failed -- a root cause.
 *  4. Notify the summary/fail status devices when some failing node's root
 *     cause changed since the previous pass.
 *  5. Roll over the sink's expected-metrics counter.
 *  6. Localize every failure that was found.
 */
void track_lost_nodes()
{
  int i, j, prev_root_cause;
  sympathy_node_app_info_t* snode;
  sympathy_node_info_t* stat;
  int notify = 0;

  elog(LOG_DEBUG(1), "window: %d, metric-pd: %d\n", sink.window,
      sink.metric_pd);

  /* Only include nodes whom we have heard from in the past
   * epoch - in analysis of other nodes  */
  for (i = 0; i < sink.num_srcs; i++)
  {
    stat = &sink.status_srcs[i];

    /* Calculate values for last-epoch */
    update_counters(stat, 0);

    /* Clear previous values, which are stored until next calculation */
    stat->error_events = 0;
    clear_buf(&stat->fault_buf);
    clear_buf(&stat->topology_info);

    /* No diagnostic text is wanted here, hence the NULL fault buffer */
    if (!received_data(&stat->metrics_rx, "data", NULL))  {
      stat->metrics_valid = 0;
    }
    else stat->metrics_valid = 1;
    stat->congestion_detected = 0;
  }

  /* Don't begin checking for failures until we have had 
   * TRACK_FAILURE_WINDOW_SIZE metrics periods */
  if (sink.metric_pd < TRACK_FAILURE_WINDOW_SIZE) return;

  /* Set the failure category for all nodes */
  for (i = 0; i < sink.num_srcs; i++)
  {
    stat = &sink.status_srcs[i];

    /* Remembered so that a changed root cause can be detected below */
    prev_root_cause = stat->failure_root_cause;

    /* If the node is a sink, then look for system wide failures because the
     * sink itself won't have any failures we want to detect */
    if (stat->addr == my_node_id)
    {
      stat->failure_type = step3_root_cause_system_failure(stat);
    }

    /* Otherwise check failures on the node, sympathy and remaining components */
    else
    {
      step1_run_tests(stat);

      // NR make sympathy one of the components and check it as such?
      stat->failure_type = step2_set_failure(stat->error_events);

      /* Then check metrics for each application registered with sympathy */
      for (j = 0; j < sink.num_apps_registered; j++) {
        snode = &stat->node_app_info[j];
        snode->error_events = 0;
        clear_buf(&snode->fault_buf);
        
        /* Run tests on component */
        step1_check_component(snode);
        snode->failure_type = step2_set_failure(snode->error_events);

        /* If this component has a failure, root-cause it. The result has
         * to be stored: it is read just below, and the call has no other
         * effect when test_events is 0. */
        if (snode->failure_type > SFL_OK) {
          snode->failure_root_cause =
            step3_root_cause_failure(stat, snode->error_events, 0);
        }

        /* If the current failure assignment for the node is an OK, and a component
         * has a failure, then we set the current failure assignment
         * to Insufficient data */
        if ((stat->failure_type == SFL_OK) && (snode->failure_type > SFL_OK)) {
          stat->failure_type = SFL_INSUFFICIENT_DATA;
          stat->failure_root_cause = snode->failure_root_cause;
        }
      }

      /* If we have a failure from this node, then try to root-cause it */
      if (stat->failure_type > SFL_OK) {
        stat->failure_root_cause = 
          step3_root_cause_failure(stat, stat->error_events, 1);
        elog(LOG_DEBUG(1), "CHECK node %d had failure: %d, root-cause: %d\n", 
            stat->addr, stat->failure_type, stat->failure_root_cause);
      }
    }

    /* If this root-cause is new, we note it and notify devices */
    if (stat->failure_type > SFL_OK) {
      if (prev_root_cause != stat->failure_root_cause) {
        stat->period_root_caused = sink.metric_pd;
        notify = 1;
      }
    }
  }

  if (notify) {
    g_status_dev_notify(sink.summary_status);
    g_status_dev_notify(sink.fail_status);
  }

#ifdef USE_BAYES
  /* create bayes network based on routes */
  bayes_classify_network(my_node_id);
#endif

  /* Clear the agg_prev_epoch values, and update counters */
  stats_ctr_update(&sink.expected_num_sympathy_metrics, 0, 0);
  stats_ctr_update(&sink.expected_num_sympathy_metrics, 1, 0);

  /* Diagnose the failure - either caused by congestion along
   * the path, or what? */
  for (j = 0; j < sink.num_srcs; j++) {
    stat = &sink.status_srcs[j];
    if (stat->failure_type > SFL_OK) step4_localize_failure(stat);
  }
}

CENS CVS Mailing List
Powered by
ViewCVS 0.9.2