#!/usr/my-local/bin/perl -w #,---- #| ROC_FROM_C4.5.PL --- Generate ROC curve from C4.5 tree #| Created Thu Feb 5 2004 by Tom Fawcett (tfawcett@acm.org) #| Copyright (2004) by Tom Fawcett #| $Id$ #`---- # my ($Usage) = "perl ROC_from_c4.5.pl tree_file pos_class This is a simple script to generate ROC points from a C4.5 tree. Inputs: tree_file should be the C4.5 log file containing the ASCII version of the tree (NOT the internal form) pos_class is the name of the class that should be considered the positive class. Outputs: Prints to stdout the ROC tuples. One point on each line, values separated by spaces. Points will be non-decreasing by X value (FP rate). Limitations: 1. Only handles two-class problems. 2. If you provide a log file containing several trees (eg, original and pruned) the script will very likely do the wrong thing. I could make it more robust and have it check carefully for beginning and end of the tree, but I haven't bothered. 3. If the tree goes so wide that it generates subtrees (via the [Snn] markers), script might not work. Or maybe it will. "; ############################################################################ use English; use strict; my ( $file, $pos_class ) = @ARGV; unless ( defined($file) and defined($pos_class) ) { die "Usage:\n$Usage\n"; } my ($neg_class); my (@points); my (%count); open( IN, $file ) or die "open($file): $!"; while () { if (/:\s+(\S+)\s+\(([\d\.]+)(\/([\d\.]+))?\)/) { my $class = $1; my $matched = $2; my $errors = 0; $errors = $3 if defined($3); my $TP = $matched - $errors; my $threshold = $TP / $matched; $count{$class} += $TP; if ( $class eq $pos_class ) { push( @points, [ $threshold, $errors, $TP, 1 ] ); } else { $threshold = 1 - $threshold; push( @points, [ $threshold, $TP, $errors, 0 ] ); } } } close(IN); if ( keys %count == 0 ) { die "Didn't find a parseable C4.5 tree in $file\n"; } elsif ( keys %count != 2 ) { die "This script only works for 2-class problems!\n"; } elsif ( !defined( $count{$pos_class} ) ) { die "Never saw any instances of class $pos_class\n"; } for ( keys %count ) { $neg_class = $_ if $_ ne $pos_class } my ($FP) = 0; my ($TP) = 0; my (@ROC_points); for my $pt ( sort { $a->[0] <=> $b->[0] } @points ) { if ( $pt->[3] ) { # Positive instance $FP += $pt->[1]; $TP += $pt->[2]; } else { # Negative instance $FP += ( $count{$neg_class} - $pt->[0] ) / $count{$neg_class}; $pt->[2] = ( $count{$pos_class} - $pt->[1] ) / $count{$pos_class}; } } push( @pos_points, [ 0, 0 ], [ 1, 1 ] ); my (@sorted) = sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] } ( @pos_points, @neg_points ); for my $pt (@sorted) { print join( " ", @$pt ), "\n"; } ##### End of ROC_from_c4.5.pl