#!/usr/bin/env python
#
# partition.py - Partitions files into fixed-size buckets, using one of
# several algorithms.
#
# (c) Chad Austin 2003
#
# References:
#   1) conversations with Josh Carlson - 2003.09.29
#   2) http://everything2.com/index.pl?node=bin%20packing
#   3) http://www.cs.queensu.ca/home/cisc365/1998Dawes/dynprognotes.html

import os
import sys

class File:
    def __init__(self, filename):
        self.filename = filename
        stat = os.stat(filename)
        self.size = stat.st_size

    def __str__(self):
        return str(self.filename) + " " + str(self.size)

    def __cmp__(self, other):
        return cmp(self.filename, other.filename)

    def __hash__(self):
        return hash(self.filename)

class Bucket:
    def __init__(self, file = None):
        if file:
            self.files = [file]
            self.size = file.size
        else:
            self.files = []
            self.size = 0

    def add(self, file):
        self.files.append(file)
        self.size += file.size

    def combine(a, b):
        r = Bucket()
        r.files = a.files + b.files
        r.size  = a.size  + b.size
        return r

    def show(self):
        print "    size: ", self.size
        print "    files:"
        for f in self.files:
            print "        ", f.filename, " - ", f.size

    def __str__(self):
        return repr((self.size, self.files))


def OrderedAlgorithm(files, maxBucketSize):
    buckets = []

    bucket = Bucket()
    for f in files:
        assert f.size <= maxBucketSize, "File too big for bucket"
        if not bucket:
            bucket = Bucket()
        if bucket.size + f.size > maxBucketSize:
            buckets.append(bucket)
            bucket = Bucket(f)
        else:
            bucket.add(f)
        
    if bucket.size:
        buckets.append(bucket)

    return buckets


def BetterOrderedAlgorithm(files, maxBucketSize):
    cache = {}
    
    def subcalc(start, end):
        cacheEntry = (start, end)
        if cacheEntry in cache:
            return cache[cacheEntry]

        if start >= end:
            cache[cacheEntry] = []
            return []

        solutions = []  # List of bucket list solutions.

        # Fill the biggest possible bucket at the front.
        b = Bucket()
        s = start
        while s < end and files[s].size + b.size <= maxBucketSize:
            b.add(files[s])
            s += 1

        fileCount = len(b.files)
        
        assert fileCount > 0

        for i in range(fileCount):
            c = Bucket()
            for f in b.files[0:fileCount-i]:
                c.add(f)
            assert len(c.files) > 0
            solutions.append([c] + subcalc(start + len(c.files), end))

        if solutions:
            best = solutions[0]
            for i in solutions[1:]:
                if len(i) > best:
                    best = i
            cache[cacheEntry] = best
            #print cacheEntry, best
            return best
        else:
            print "Empty bucket list found?"
            cache[cacheEntry] = []
            return []

    return subcalc(0, len(files))

def DecreasingFitFirstAlgorithm(files, maxBucketSize):
    files.sort(lambda a, b: cmp(b.size, a.size))
    
    buckets = []
    for f in files:
        added = False
        for b in buckets:
            if b.size + f.size <= maxBucketSize:
                b.add(f)
                added = True
                break
        if not added:
            buckets.append(Bucket(f))

    return buckets


def HuffmanAlgorithm(files, maxBucketSize):
    import copy
    
    buckets = map(Bucket, files)

    def smallest(bs):
        s = 0
        for t in range(1, len(bs)):
            if bs[t].size < bs[s].size:
                s = t
        b = bs[s]
        del bs[s]
        return b

    while len(buckets) > 1:
        orig = copy.copy(buckets)
        i = smallest(buckets)
        j = smallest(buckets)

        if i.size + j.size > maxBucketSize:
            return orig

        buckets.append(Bucket.combine(i, j))

    return buckets


def KnapsackAlgorithm(files, maxBucketSize):
    buckets = []

    while files:

        print "Calculating bucket..."
            
        table = {}  # cache of results

        # returns tuple (value, files)
        def V(i, size):
            key = (i, size)
            if table.has_key(key):
                return table[key]
            
            if i == 0:
                result = (0, [])
                table[key] = result
                return result
            
            f = files[i - 1]
            if f.size > size:
                result = V(i - 1, size)
                table[key] = result
                return result
            else:
                take = V(i - 1, size - f.size)
                dont = V(i - 1, size)
                result = max((take[0] + f.size, take[1] + [f]), dont)
                table[key] = result
                return result

        contents = V(len(files), maxBucketSize)[1]
        bucket = Bucket()
        for f in contents:
            bucket.add(f)
            files.remove(f)

        buckets.append(bucket)

    return buckets


# Too slow to be usable.
def SlowAlgorithm(files, maxBucketSize):
    import copy
    import heapq

    class State:
        def __init__(self, buckets):
            self.buckets = buckets

        def __cmp__(self, other):
            return cmp(len(self.buckets), len(other.buckets))

    initial = State(map(Bucket, files))
    best = initial
    states = [initial]
    
    while states:
        t = heapq.heappop(states)
        l = len(t.buckets)
        for i in range(l):
            for j in range(i + 1, l):
                bi = t.buckets[i]
                bj = t.buckets[j]

                if bi.size + bj.size <= maxBucketSize:
                    k = copy.copy(t.buckets)
                    k[i] = Bucket.combine(bi, bj)
                    del k[j]
                    
                    q = State(k)
                    if q < best:
                        best = q
                    print "# States remaining:", len(states), " Best so far:", len(best.buckets)
                        
                    heapq.heappush(states, q)

    return best.buckets


DEFAULT_MAX_SIZE = 700 * 1024 * 1024

def usage():
    print "Usage:"
    print "    partition.py [options] [paths]"
    print
    print "Options:"
    print "    -h              Print this usage information"
    print "    -s <size>       Max size of each bucket (in bytes) [%d]" % DEFAULT_MAX_SIZE
    print "    -a <algorithm>  Partitioning algorithm [ordered]"
    print "       ordered        All files remain ordered (naive)"
    print "       ordered2       All files remain ordered (optimal)"
    print "       dff            Decreasing Fit First (fast, near-optimal solution)"
    print "       huffman        Combine smallest buckets until done"
    print "       knapsack       Multiple Knapsack algorithm, pretty slow (but really good)"
    print "       slow           Optimal partitioning, unbearably slow"
    print "    --move=<name>   Move files into bucket directories <name>0, <name>1..."


def main():
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hs:a:", "move=")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # default option values
    maxBucketSize = DEFAULT_MAX_SIZE
    partition = OrderedAlgorithm
    paths = ['.']
    move = False
    name = ''

    if args:
        paths = args
    
    # parse options
    for o, a in opts:
        if o == "-h":
            usage()
            sys.exit()
        if o == "-s":
            maxBucketSize = int(a)
        if o == "-a":
            if a == "ordered":
                partition = OrderedAlgorithm
            elif a == "ordered2":
                partition = BetterOrderedAlgorithm
            elif a == "dff":
                partition = DecreasingFitFirstAlgorithm
            elif a == "huffman":
                partition = HuffmanAlgorithm
            elif a == "knapsack":
                partition = KnapsackAlgorithm
            elif a == "slow":
                partition = SlowAlgorithm
            else:
                usage()
                sys.exit(2)
        if o == "--move":
            if not a:
                usage()
                sys.exit(2)
            move = True
            name = a

    # build list of files
    def visit(arg, dirname, names):
        for n in names:
            path = os.path.join(dirname, n)
            if os.path.isfile(path):
                arg.append(File(path))
    files = []
    for p in paths:
        if os.path.isdir(p):
            os.path.walk(p, visit, files)
        else:
            files.append(File(p))

    # remove duplicates
    set = {}
    for f in files:
        set[f] = None
    files = [s for s in set.keys()]
    files.sort()

    for f in files:
        if f.size > maxBucketSize:
            print "WARNING: File '%s' is larger than bucket size!" % f.filename

    # run the partition algorithm
    buckets = partition(files, maxBucketSize)

    # display buckets
    if not move:
        for i in range(len(buckets)):
            print "Bucket %d:" % i
            buckets[i].show()
    else:
        def shmakedirs(dirs):
            import errno
            try:
                os.makedirs(dirs)
            except os.error, e:
                if e[0] != errno.EEXIST:
                    raise
        
        def shmove(bucket, srcpath):
            import shutil
            
            srcpath = os.path.normpath(srcpath)
            split = os.path.split(srcpath)
            targetdir = os.path.join(bucket, split[0])
            
            shmakedirs(targetdir)
            
            shutil.move(srcpath, os.path.join(bucket, srcpath))
        
        for i in range(len(buckets)):
            dir = name + str(i)
            print "Creating bucket", dir

            for f in buckets[i].files:
                print "   Adding", f.filename
                shmove(dir, f.filename)


if __name__ == "__main__":
    main()

