#!/bin/ksh # # CMSGCStats.ksh - v1.0 # # Author : Gavin Satur - http://it.cybergav.in/downloads/CMSGCStats.ksh # Date : 7th January 2010 # Description : Quick-and-Dirty script for a basic summary of GC Activity for Hotspot JVM 1.4.2 using CMS GC. For HP JDKs, you can use the very comprehensive and sophisticated HPjmeter. Also, check out Sun's PrintGCStats (a nawk program) for Hotspot JVMs and Tagtraum's GCviewer. # You must enable -XX:+PrintGCDetails, -XX:+PrintGCTimeStamps and -Xloggc (log GC details to separate file rather than mix them with other details in stdout). # Tested on : GC logs for Hotspot JVM versions 1.4.2_10, 1.4.2_11, 1.4.2_19. Tests were successful, but the script could always balk for certain formats # of the GC log. Couldn't be bothered catering to the myriad of JVM flags and their output formats in GC logs [is it even possible? ;)]. # ###################################################################################################################### # # Determine Script Location # if [ -n "`dirname $0 | grep '^/'`" ]; then SCRIPT_LOCATION=`dirname $0` elif [ -n "`dirname $0 | grep '^..'`" ]; then cd `dirname $0` SCRIPT_LOCATION=$PWD cd - > /dev/null else SCRIPT_LOCATION=`print ${PWD}/\`dirname $0\` | sed 's#\/\.$##g'` fi SCRIPT_NAME=`basename $0` if [ ! -f ${SCRIPT_LOCATION}/${SCRIPT_NAME} ]; then print "\n`date '+%Y%m%d_%H%M'` : ERROR : Could not detect script location and/or name. Check and test script. Exiting...\n" exit 999 fi # # Variables, Functions and Validation # isNum() { case $1 in +([0-9]) ) ;; * ) printf no ;; esac } myUsage() { print "\n\nUSAGE: ksh CMSGCStats.ksh -F -H -N [[-d ]|[-h ]|[-m ]]\n" print "Example: To obtains CMSGCStats for myGCLog (for a JVM with -Xmx=2048m and -XX:MaxNewSize=768m) with additional stats for the LAST HOUR:\n" print "ksh CMSGCStats.ksh -F myGCLog -H 2048 -N 768 -h 1\n\n" exit 999 } [[ $# -eq 0 ]] && myUsage unset opt_string while getopts ":F:H:N:d:h:m:" opt; do case $opt in F ) if [ ! -s $OPTARG ]; then print "\nERROR: GC logfile is missing or empty.\n" myUsage else GC_LOG=$OPTARG opt_string="F" fi ;; H ) if [ "`isNum $OPTARG`" = "no" ]; then print "\nERROR: Invalid Heap size.\n" myUsage else xmx=$OPTARG opt_string="${opt_string}H" fi ;; N ) if [ "`isNum $OPTARG`" = "no" ]; then print "\nERROR: Invalid Newsize (YG).\n" myUsage else maxnewsize=$OPTARG opt_string="${opt_string}N" fi ;; d ) if [ "`isNum $OPTARG`" = "no" -o -z "$OPTARG" ]; then print "\nERROR: INVALID value for sample interval (days). Exiting..." exit 999 else TPUT_PERIOD="$OPTARG day(s)" TPUT_INTERVAL=$(( OPTARG * 24 * 60 * 60 )) fi ;; h ) if [ "`isNum $OPTARG`" = "no" ]; then print "\nERROR: INVALID value for sample interval (hours). Exiting..." exit 999 else TPUT_PERIOD="$OPTARG hour(s)" TPUT_INTERVAL=$(( OPTARG * 60 * 60 )) fi ;; m ) if [ "`isNum $OPTARG`" = "no" ]; then print "\nERROR: INVALID value for sample interval (mins). Exiting..." exit 999 else TPUT_PERIOD="$OPTARG minute(s)" TPUT_INTERVAL=$(( OPTARG * 60 )) fi ;; : ) print "\nERROR: Missing argument for -$OPTARG option" myUsage ;; \? ) myUsage ;; esac done shift $(($OPTIND - 1)) if [ "$opt_string" != "FHN" ]; then myUsage fi GC_FILE=${SCRIPT_LOCATION}/gctimes.$$ GC_LATEST_FILE=${SCRIPT_LOCATION}/gc_latesttimes.$$ LOG_FILE=${SCRIPT_LOCATION}/${SCRIPT_NAME%%.*}.log JVM_TPUT_THRESHOLD=95 CMF_THRESHOLD=1 if [ -s $LOG_FILE ]; then if [ `wc -c $LOG_FILE | nawk '{print $1}'` -ge 1048576 ]; then mv $LOG_FILE ${LOG_FILE}.old fi fi # # Main # # # Extract lines with GC times # egrep "\[GC|\[Full GC|concurrent mode failure" $GC_LOG > $GC_FILE if [ ! -s $GC_FILE ]; then print "\nERROR: Cannot extract lines with GC. Are you using -XX:+PrintGCDetails and -XX:+PrintGCTimeStamps JVM flags? Exiting...\n" [[ -f $GC_FILE ]] && rm $GC_FILE exit 999 fi # # ParNew GC stats # init_sec=`grep "^[0-9].*ParNew" $GC_FILE | head -1 | nawk -F: '{print $1}'` final_sec=`grep "^[0-9].*ParNew" $GC_FILE | tail -1 | nawk -F: '{print $1}'` par_count=`grep "^[0-9].*ParNew" $GC_FILE | wc -l` if [ $par_count -gt 0 ]; then par_int=`echo "scale=2; ($final_sec - $init_sec) / $par_count" | bc -l` [[ $par_int -eq 0 ]] && par_int="N/A" total_par_stw_dur=`sed -n '/\[ParNew.*\]/p' $GC_FILE | nawk -F: '{print $4}' | sed '/^$/d' | nawk 'BEGIN{count = 0}{count = count + $(NF -1)}END{print count}'` par_dur=`echo "scale=2; $total_par_stw_dur / $par_count" | bc -l` else par_int="N/A" par_dur="N/A" fi # # Serial Old GC (Mark-Sweep-Compact) stats # Note: Apart from explicitly using Serial Old GC, a concurrent mode failure indicates a Serial Old GC has been triggered. # init_sec=`grep "^[0-9].*\[Full GC" $GC_FILE | nawk -F: '{print $1}' | head -1` final_sec=`grep "^[0-9].*\[Full GC" $GC_FILE | nawk -F: '{print $1}' | tail -1` fgc_count=`egrep "\[Full GC|concurrent mode failure" $GC_FILE | grep -v "\[Full GC.*\[CMS[0-9].*: \[CMS-" | wc -l` if [ $fgc_count -gt 0 ]; then fgc_int=`echo "scale=2; ($final_sec - $init_sec) / $fgc_count" | bc -l` [[ $fgc_int -eq 0 ]] && fgc_int="N/A" total_fgc_stw_dur=`egrep "\[Full GC|concurrent mode failure" $GC_FILE | grep -v "\[Full GC.*\[CMS[0-9].*: \[CMS-" | nawk 'BEGIN{count = 0}{count = count + $(NF -1)}END{print count}'` fgc_dur=`echo "scale=2; $total_fgc_stw_dur / $fgc_count" | bc -l` else fgc_int="N/A" fgc_dur="N/A" fi # # Concurrent Mode Failure stats # cmf_count=`grep "concurrent mode failure" $GC_FILE | wc -l | sed 's/^ *//g'` # # CMS GC stats # init_sec=`grep "CMS-initial-mark" $GC_FILE | head -1 | nawk -F: '{print $1}'` final_sec=`grep "CMS-initial-mark" $GC_FILE | tail -1 | nawk -F: '{print $1}'` cms_count=`grep "CMS-initial-mark" $GC_FILE | wc -l` if [ $cms_count -gt 0 ]; then cms_int=`echo "scale=2; ($final_sec - $init_sec) / $cms_count" | bc -l` [[ $cms_int -eq 0 ]] && cms_int="N/A" total_cms_stw_dur=`egrep "CMS-initial-mark|CMS-remark" $GC_FILE | nawk 'BEGIN{count=0}{count = count + $(NF-1)}END{print count}'` cms_dur=`echo "scale=2; $total_cms_stw_dur / $cms_count" | bc -l` else cms_int="N/A" cms_dur="N/A" fi # # JVM Throughput # # For Entire Duration # jvm_init_sec=`head -1 $GC_FILE | nawk -F: '{print $1}'` jvm_final_sec=`tail -1 $GC_FILE | nawk -F: '{print $1}'` jvm_dur=`echo "scale=3; $jvm_final_sec - $jvm_init_sec" | bc -l` jvm_gc_dur=`nawk 'BEGIN{count=0}{count = count + $(NF-1)}END{print count}' $GC_FILE` jvm_tput=`echo "scale=2; 100 - ($jvm_gc_dur * 100 / $jvm_dur)" | bc -l` # # For TPUT_INTERVAL # if [ "`isNum $TPUT_INTERVAL`" != "no" ]; then tput_init_sec=`nawk -F: -v lt=$jvm_final_sec -v sam=$TPUT_INTERVAL '{if ( (lt - $1) <= sam) print $1}' $GC_FILE | head -1` tput_dur=`echo "scale=3; $jvm_final_sec - $tput_init_sec" | bc -l` sed -n "/$tput_init_sec/,/$jvm_final_sec/p" $GC_FILE > $GC_LATEST_FILE if [ -s $GC_LATEST_FILE ]; then tput_gc_dur=`nawk 'BEGIN{count=0}{count = count + $(NF-1)}END{print count}' $GC_LATEST_FILE` sam_tput=`echo "scale=2; 100 - ($tput_gc_dur * 100 / $tput_dur)" | bc -l` sam_cmf_count=`grep "concurrent mode failure" $GC_LATEST_FILE | wc -l | sed 's/^ *//g'` sam_jvm_gc_dur=`nawk 'BEGIN{count=0}{count = count + $(NF-1)}END{print count}' $GC_LATEST_FILE` else print "`date '+%d-%m-%Y %H:%M'` : ERROR : Could not extract GC data for specified interval.\n" fi fi # # Check threshold and raise alarm, if required # if [ $sam_tput -lt $JVM_TPUT_THRESHOLD ]; then print "`date '+%d-%m-%Y %H:%M'` : WARNING : JVM THROUGHPUT IN LAST $TPUT_PERIOD = $sam_tput %. Threshold (< ${JVM_TPUT_THRESHOLD} ) breached.\n" >> $LOG_FILE fi if [ $sam_cmf_count -ge $CMF_THRESHOLD ]; then print "`date '+%d-%m-%Y %H:%M'` : WARNING : #CONCURRENT MODE FAILURE(S) IN LAST $TPUT_PERIOD = $sam_cmf_count. Threshold (>= ${CMF_THRESHOLD} ) breached.\n" >> $LOG_FILE fi # # CMS GC Trigger Bands # total_trigger_count=0 count1_20=0 count21_40=0 count41_50=0 count51_60=0 count61_70=0 count71=0 oldgensize=$(( (xmx - maxnewsize) * 1024 )) nawk -v og=$oldgensize '/CMS-initial-mark/ {printf "%3.2f\n", (substr($5,1,index($5,"K")-1) * 100)/og}' $GC_FILE | while read line do if [ $line -le 20 ]; then count1_20=$(( count1_20 + 1 )) elif [ $line -gt 20 -a $line -le 40 ]; then count21_40=$(( count21_40 + 1 )) elif [ $line -gt 40 -a $line -le 50 ]; then count41_50=$(( count41_50 + 1 )) elif [ $line -gt 50 -a $line -le 60 ]; then count51_60=$(( count51_60 + 1 )) elif [ $line -gt 60 -a $line -le 70 ]; then count61_70=$(( count61_70 + 1 )) elif [ $line -gt 70 ]; then count71=$(( count71 + 1 )) fi total_trigger_count=$(( total_trigger_count + 1 )) done if [ $total_trigger_count -gt 0 ]; then count1_20_pct=$(( count1_20 * 100 / total_trigger_count )) count21_40_pct=$(( count21_40 * 100 / total_trigger_count )) count41_50_pct=$(( count41_50 * 100 / total_trigger_count )) count51_60_pct=$(( count51_60 * 100 / total_trigger_count )) count61_70_pct=$(( count61_70 * 100 / total_trigger_count )) count71_pct=$(( count71 * 100 / total_trigger_count )) else cms_gc=no fi # # Display Report # clear cat <70 : $count71 ( ~ $count71_pct %) --------------------------------------------------------------------------------- ========================================================================================================= ** NOTE ** : If you have concurrent mode failures in your GC logs or have -XX:+TraceClassUnloading enabled, some Full GC stats (average interval and average duration) may not be accurate. ParNew and CMS stats have always been accurate during testing. EOF fi # # Remove temp files # [[ -f $GC_FILE ]] && rm $GC_FILE [[ -f $GC_LATEST_FILE ]] && rm $GC_LATEST_FILE