#!/usr/local/bin/perl -w # $Id: CartesianProduct.pm $ # This Perl module is an iterator over a Cartesian product. # Written by Steven Tolkin # (This file is not executable. We include the first line so the Unix # command named "type" shows the file type as something like "perl script".) # This section based upon the Module Template 12.18 in _The Perl Cookbook_ package Set::CartesianProduct; require 5.004; use strict; # use integer helped performance a lot on Sparc, but barely at all on Intel use integer; use Carp; my $dbg = 1; # temp to debug # Maybe later provide the caller a way to turn off assertion checking # to improve performance slightly. # The only one that matters is the one at the beginning of fetch() # as the others only happen once per result set. # Maybe later remove that first one. # For maximum performance maybe should have two entirely separate # versions, e.g. fetch and fetch_no_assert. # But I have measured the performance and assert has a minor impact. my $check_assert = 1; require Exporter; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); $VERSION = 1.002; # Manually maintained @ISA = qw(Exporter); # No need to export anything for object oriented modules. @EXPORT = qw ( ); @EXPORT_OK = qw ( COLUMN_MAJOR_ORDER ROW_MAJOR_ORDER ); =pod =head1 NAME Set::CartesianProduct - Iterator over the Cartesian Product of sets, bags, or lists =head1 SYNOPSIS use Set::CartesianProduct; # or import some constants, e.g. use Set::CartesianProduct ('COLUMN_MAJOR_ORDER'); # The caller sends a list of "sets" (of scalars) to new(), e.g. my $list_of_sets = [ [qw(a another)], [qw(red pink)], [qw(rose tulip)] ]; my $iter = Set::CartesianProduct->new($list_of_sets); my $row; while ($row = $iter->fetch() ) {print "@{$row}\n";} # Other features and functions # The new() call takes an optional argument where any true value # suppresses the warning about empty input. $iter->new($list_of_sets,1); # The default traversal order is ROW_MAJOR_ORDER. This can be changed # by set_traversal_order(), but only when positioned at the start, # e.g. after calling rewind(). The count() function returns the total # numbers of rows in the cartesian product. $iter->rewind(); $iter->set_traversal_order(COLUMN_MAJOR_ORDER); my $save_traversal_order = $iter->get_traversal_order(); =head1 DESCRIPTION This program iterates over the Cartesian product of a list of sets. In fact the "sets" can also be bags (multi-sets) or lists. At each step in the iteration it produces one output "tuple", which (following the relational model) we call a "row". When there are no more rows it returns the special scalar: undef. If called again after returning undef it raises an exception, i.e. croaks. If the input "sets" have duplicates then the output will have duplicate rows. The default traversal order is row major order, as in the C language. This is like an odometer -- the rightmost array index varies most rapidly. There is a way to set the opposite order, called column major order. By default this emits a warning message if there are no input "sets", or if any are empty. There is a way to suppress this warning, because while this situation often indicates a bug, it is valid. If there are a small fixed number of sets the usual way to iterate over the Cartesian product is to write nested loops, one for each set. This module is useful when the numbers of sets is not known at the time the program is being written, or when the number of sets is "large", e.g. more than 4. Some simple performance tests indicate that in the worst case (where no processing done on each row) this module is 20 to 30 time slower than using hard coded nested loops. For efficiency this simply uses the references to the sets that are passed in, rather than making a "deep copy" of each one. The caller should not change any of the input sets while using an object returned by this module. In particular reducing the size of a set can cause arbitrary behavior, increasing the size of a set will not be detected. Under some circumstance changes the value of a sclara reference might be a feature. =head1 SEE ALSO List::Combination is a module that iterates over all the combinations of items in a set, bag or list, e.g. taken 2 at a time, or 3 at a time etc. This is a different operation, but sometimes the word "combination" is mistakenly used to refer to Cartesian product. =head1 NOTES This module must be installed in the Set directory of the perl library directory, e.g. perhaps in \perl\lib\set on Microsoft systems. The required argument to new is a list of "sets". These sets contain scalars. In the example the scalars are strings, but they can be any type of scalars, including the special scalar undef. Processing undef in the caller's function will as usual produce the warning "Use of unitialized value ..." unless the caller turns off warnings, e.g. with $^W = 0; The Cartesian product is the basis for the relational database operator named join. However databases typically use special implementations to achieve adequate performance when doing a join. This has been placed in the Set:: hierarchy rather than in List:: because a module belongs in the lowest appropriate place in the hierarchy, rather than the highest. An operator that applies to a set is likely to make sense for all higher types (i.e. bags and lists), but not vice versa. For example the operation count() makes sense for all three type of containers, whereas first() only applies to lists. (The Combination module should have been in Set:: in my opinion.) Some exceptions raised by this program correspond to what would be compile time errors in a strongly typed language. Others correspond to run time fatal errors. Placing the call in an eval block will trap these, as usual. =head1 AUTHOR Steven Tolkin home: work: permanent forwarding: =head1 COPYRIGHT Copyright (c) 1999 Steven Tolkin. All rights reserved. This module can be distributed under the same terms as Perl itself. =cut # Changes: # Who When What # tolkin 1998-04-01 started from a program by John Redford # renamed variables, changed termination condition etc. # tolkin 1998-04-03 changed main loop to use a carry based approach # change to emit the results in row major by default # but allow column major if preferred. (NOT hooked up to user option yet.) # tolkin 1998-04-06 Fixed location of safety check, formatting, renamed # product.pl to product.pm, and wrote testprod.pl which see. # tolkin 1999-01-18 Renamed (again!) to CartProd (and temporarily to CartIter) # Changed to use an iterator style, and object oriented # restored use strict (needs perl 5.004). Added pod, by copying Set::Bag # tolkin 1999-01-23 Added prototypes, broke out _init, created set_order, # get_order, and is_row_major, fixed minor bugs, added comments, # Wrote tests of all functionality and all croak calls -- see testcart.pl # tolkin 1999-01-24 More improvements and tests. Renamed testprod.pl # to testcart.pl. Moved this code into Set::CartesianProduct # tolkin 1999-01-27 Added many suggestions made at the Boston Perl Mongers # group: all upper case for labels and constant functions, etc. # Made some of the suggested performance improvements too, but they # did not really make a difference; see below. # Changed {last} to {"last"} to avoid spurious warning from some versions # of perl about "Ambiguous use of last ..." # Renamed next to fetch, to prevent clash with a perl keyword. # tolkin 1999-01-31 Added count(), fetch_all(), rewind(), improved # handling of assertions # # For other programs to do this see the responses to my posting to # news://comp.lang.perl.misc on Friday, March 27, 1998 with Subject: # How to generate a Cartesian product of a varying number of sets? # One reason I chose this implementation is that it just # iterates over arrays, with no hashes, stacks, trees, or recursion. # It can be ported to almost any language, and should have good efficiency. # (But any approach to generating the rows is likely # to be cheap compared to processing them.) # Do Later: # Write description of each function. # Consider adding prototypes to the functions. # But that does not provide much value, see _Advanced Perl Programming_ # Check what happens if fetch called in a list context # Replace hard coded function names in croak etc with e.g. (caller(0))[3] # or whoami() as in recipe 104. # Maybe later have fetch in list context return the actual row. # This could simplify fetch_n a little bit. # This is about 20 to 30 times slower than hard coded nested loops. # Measure the performance of my earlier implementation that # was not an iterator, and not object oriented, but which took # a user function to call as an argument. # I changed from that approach because iterators are simpler, and avoid # the issue of needing to provide a way to stop on a false return code, etc. # However the code is now more awkward, e.g. I had to change from # $row to $i->{row} etc. and I suspect this causes a lot of the slowness. # Try using an array instead of a hash for the object, and then use # constants via subroutines to maintain readbaility. # The code could be simplified by basing it on an abstract # data type of "circular array". This is like an array in traditional # languages, e.g. C, in that it has a fixed number of elements. There is # an operation next that advances from the last element to the first. # However I suspect this might degrade performance. # Consider it if I ever implement a bidirectional or random iterator. # In the C++ Standard Template Library an iteration indicates it is done # by returning its endpoint, rather than null (i.e. undef). See if that # approach makes sense in perl. Look into adopting this algorithm # for C++ and/or Java. # Later try to find a more principled way to handle assertions. # My current approach uses both carp (to emit the line number in the caller) # and die (NOT croak) to emit the line number in the module. # There might be a way using caller() to produce these in a simpler way. # Misc. comments: # The module is named CartesianProduct even though it is suggested # that modules have names that are <= 8 characters. I prefer to have # the meaningful name, rather than e.g. CartProd.pm, even if it means # this will not be useable in DOS. # # Note that the term "Cartesian product" was originally used # to refer to sets only, and only two sets at that. # But it has an obvious generalization to an arbitrary number of sets, or # bags or lists, and this implementation supports them in the natural way. # It can also be generalized to "higher" containers, e.g. see the paper # "Isoperimetric Number of the Cartesian Product of Graphs and Paths" # by M. Cemil Azizoglu and Omer Egecioglu (that O should have an umlaut) # Technical Report TRCS98-15, Computer Science Department, # University of California, Santa Barbara, May 1998. # A Postscript file is available at # # # The concept of the traversal order is orthogonal to the concept of the # category of the iterator. This module currently is a forward iterator, # to use the C++ terminology. # Extensions to this class could also support a bidirectional # iterator, a reverse iterator, or a random access iterator. # # This kind of iterator has only 2 reasonable values for traversal order # so I could have use a "boolean" value to set and get the order. # Instead I have adopted an approach (using a set of constants) that can # be extended to other types of iterators. For example # an iterator over a binary tree can be pre_order, post_order, or in_order. # I use an assertion exclusively to detect a bug in the code, i.e. # an "impossible" condition. That is why the text of # the message begins "Bug:". Hopefully these will never appear. # If they do the user should report the bug to me, and should include # a small reproducible test case. # # I iterate over the sets using array subscripts, rather than the more # "perlish" foreach $s (@sets) to make it somewhat easier to port # the algorithm to traditional languages such as C++ or Java, and to # make it slightly easier to understand for programmers who know # those languages but not Perl. The performance loss should be negligible # except in a rare case of having an extremely large number of sets. # # In the fetch_all() case I do not need to handle the first row as a special # case. But if I don't then I need to move the location of some code. # So I decided to keep fetch and fetch_all similar to keep # fetch_all similar to fetch. # I considered adding get_rows_done() -- but would require keeping # the counter, so maybe not desirable for a minimal interface. # I prefer the Java style get_foo and set_foo (e.g. for traversal_order) # to the possibility of having one function that looks at its arg. # ********* Code Begins Again ******** # ***** Constants ***** # For traversal order # Use subroutines returning constants as constants. sub ROW_MAJOR_ORDER() { 0 } # the default sub COLUMN_MAJOR_ORDER() { 1 } # ***** Functions ***** # # I use $i as the "handle" for the object, rather than $self, # as it is shorter to type, and reminiscent of iterators. # Later in addition to croak call a usage() function, as in shell scripts. # This would give room to explain in the error message e.g. that sets # are numbered starting with 0. sub new { my $type = shift; my $i = {}; # Anonymous hash we populate below bless $i, $type; $i->_init(@_); # call _init with remaining args return $i; # return a reference to the hash } # As in _The Perl Cookbook_ most setup is done in a separate _init() function. # (As usual function names starting with an underscore are intended # to be private to this module.) sub _init { my $i = shift; my $sets = shift; if ( ! defined($sets)) { croak "Error: ", ref($i), " new() needs an argument for lists of sets"; } if ( ! UNIVERSAL::isa($sets, 'ARRAY')) { croak "Error: ", ref($i), " new() needs first argument to be array ref"; } $i->{n_sets} = @$sets; # save the number of sets my $n_sets = $i->{n_sets}; # Maybe later combine this loop with the one below. # But I prefer the style of checking input for validity before # doing the real processing, and the performance loss is miniscule. my $s; # the currently active set for ($s = 0; $s < $n_sets; $s++) { if ( ! UNIVERSAL::isa($$sets[$s], 'ARRAY')) { croak "Error: ", ref($i), " new() needs set[$s] to be array ref"; } } $i->{sets} = $sets; # passed all checks, so make it an instance variable # Maybe rename variable to e.g. suppress_warn_on_empty. my $opt_warn = shift; my $warn_on_empty = ! $opt_warn; # *** One time only Initialization (see also rewind() *** # Number of rows in the result, computed by multiplying input counts $i->{count} = 1; # It is *not* an error to have no sets or some empty sets. # By default we warn, as this is often a symptom of a bug in the caller. if (0 == $n_sets) { $i->{count} = 0; if ($warn_on_empty) { carp "Warning: No sets in input"; } } for ($s = 0; $s < $n_sets; $s++) { my $size = @{$$sets[$s]}; # Number of values in current set $i->{count} *= $size; # can set count to 0 # set the index of the last value in each set $i->{"last"}[$s] = $size - 1; if ( $size == 0) { if ($warn_on_empty) { carp "Warning: Input set[$s] is empty"; } } } $i->rewind(); # Initialize to start at first row $i->set_traversal_order(ROW_MAJOR_ORDER); # the default } ### Other exportable functions sub count() { my $i = shift; if ( 0 != scalar @_) { croak "Error: ", ref($i), " count() takes no arguments"; } $i->{count}; } # rewind() just returns 1 for success. # How could this fail? Is there anything useful to return? sub rewind() { my $i = shift; if ( 0 != scalar @_) { croak "Error: ", ref($i), " rewind() takes no arguments"; } my $s; # the currently active set my $sets = $i->{sets}; # factor out common code by using a local variable my $n_sets = $i->{n_sets}; for ($s = 0; $s < $n_sets; $s++) { # Create the first row from the first (0th) value in each set $i->{curr}[$s] = 0; $i->{row}[$s] = $$sets[$s][0]; } $i->{n_done} = 0; # Count up rows as we create them $i->{at_start} = 1; $i->{after_last} = 0; 1; # return true for success } sub get_traversal_order() { my $i = shift; if ( 0 != scalar @_) { croak "Error: ", ref($i), " get_traversal_order() takes no arguments"; } $i->{order}; } sub set_traversal_order($) { my $i = shift; if ( 1 != scalar @_) { croak "Error: ", ref($i), " set_traversal_order() needs 1 argument"; } if ( ! $i->{at_start}) { croak "Error: ", ref($i), " set_traversal_order() can only be called at start"; } my $opt = shift; if ( $opt == COLUMN_MAJOR_ORDER || $opt == ROW_MAJOR_ORDER ) { $i->{order} = $opt; } else { croak "Error: ", ref($i), " set_traversal_order() invalid argument: $opt" }; # if row major then the right end changes fastest, else the left end $i->{fastestset} = $i->{order} == ROW_MAJOR_ORDER ? $i->{n_sets} - 1 : 0 ; } ############ # Most of the work of iterating is done in the method named fetch(). # We need to handle the first call to it specially, because we must # advance the iterator before returning. We could eliminate this # special case if we could initially position the iterator # just before the first row. The current approach has the virtue that # each call to fetch will takes a small and bounded amount of time, as all the # initialization work was done in new. That seems better than having # a lot of initialization done in the first call to fetch(). # I have kept all the semantically correct performance optimizations, e.g. # localize instance variables, even though they did not measurably help. sub fetch() { my $i = shift; # get back reference to our anonymous hash if (scalar @_) { croak "Error: ", ref($i), " fetch() does not take any arguments"; } # Special case to return the first row before iterating # We only want to say $i->{at_start} = 0; once, for performance. if ($i->{at_start}) { $i->{at_start} = 0; if ( 0 == $i->{count}) { # handle the case of an empty result $i->{after_last} = 1; return ; # undef in scalar context, empty list in list context } else { $i->{n_done}++; return $i->{row}; } } # Maybe later move this assertion to where we increment n_done. # Test the assertion first, because if the assertion fails # it is likely the next test will fail also, and if there is a bug # we do not want to claim it is the caller's error. if ( $check_assert && ($i->{n_done} > $i->{count}) ) { carp "Please report this system bug:"; carp "When ", ref($i), " fetch() was called"; die "Assert failed: done ($i->{n_done}) > count ($i->{count})"; } my $after_last = $i->{after_last}; if ( $after_last) { croak "Error: ", ref($i), " fetch() invalid: iterator after last item"; } # Each iteration increments by one the index in the fastest set. # But if we were at the last value we need to "carry" over # to the next set, as in elementary addition. The direction of that # next set depends on whether we are using row major or column major. # Incrementing an index might require several carries, e.g. # adding 1 second to a time of 7:59:59 becomes 8:00:00. # Rather than carry past the last set we set after_last to true # and exit the carry loop. my $s = $i->{fastestset}; # Manually factor out common expressions, for performance. my $curr = $i->{curr}; my $c; # will hold a local copy of $curr->[$s] my $row = $i->{row}; # *** The carry loop increments components of the row as needed *** CARRY: while ( 1 ) { # In the typical case almost all the time we only increment the # fastest changing set, i.e. we do the true arm of this test. # So the "slowness" is right here, probably due to the subscripts. $c = $curr->[$s]; if ($c < $i->{"last"}[$s]) { $curr->[$s] = ++$c; # Must update the object, not just local copy $row->[$s] = $i->{sets}[$s][$c]; last CARRY; } else { $curr->[$s] = 0; $row->[$s] = $i->{sets}[$s][0]; if ($i->{order} == ROW_MAJOR_ORDER) { $s--; if ($s < 0) { # we've reached the end of the iterator $after_last = 1; last CARRY; } } else { # not is_row_major $s++; if ($s == $i->{n_sets}) { # we've reached the end of the iterator $after_last = 1; last CARRY; } } } } # end CARRY loop if ($after_last) { $i->{after_last} = 1; if ( $check_assert && ($i->{n_done} != $i->{count}) ) { carp "Error: Please report this system bug:"; carp "When ", ref($i), " fetch() was called"; die "Assert failed: done ($i->{n_done}) != count ($i->{count})"; } return ; # undef in scalar context, empty list in list context } else { $i->{n_done}++; return $i->{row}; } } # end sub fetch # fetch_n() returns an array containing n rows. # Note that fetch_n(1) IS NOT The same as fetch() -- fetch() returns a # row directly, whereas fetch_n(1) "wraps" that row in an array. sub fetch_n { my $i = shift; # get back reference to our anonymous hash my $how_many = shift; if ( ! (defined $how_many) || scalar @_) { croak "Error: ", ref($i), " fetch_n() takes 1 argument"; } # Later maybe test that arg is a positive *integer*. if ( ! $how_many > 0 ) { croak "Error: ", ref($i), " fetch_n() argument must be > 0"; } my @result = (); # return a reference to an array of rows my $row; while ( $how_many-- ) { # We need to stop the stop as soon as we are at the end $row = $i->fetch(); last if (! $row); # Make an anonymous copy of row and save it in the result push (@result, [ @{$row} ] ); } @result; } # end sub fetch_n # fetch_all is like fetch_n() with an argument of $i->count() # except that it always calls rewind first. # When called in a list context e.g. @rows = $h->fetch_all() # it returns a reference to the materialized array. # If called in a scalar context it returns the count. # There might be a better way to force list context than using @result !!! # In fact is this even needed? sub fetch_all { my $i = shift; # get back reference to our anonymous hash if (scalar @_) { croak "Error: ", ref($i), " fetch_all() takes no arguments"; } if ( ! defined wantarray()) { return ; # void context } elsif ( ! wantarray()) { return $i->{count}; # scalar context } # else list context my @result; $i->rewind(); @result = $i->fetch_n($i->count()); @result; } # end sub fetch_all 1; # a module must return some true value # end of file