external sort Algorithm

The external sort algorithm is a data processing technique used for sorting large volumes of data that cannot fit entirely in internal memory (RAM). It is particularly useful for situations where the data to be sorted is too large to be processed within the available memory constraints of a computer system. The algorithm works by dividing the input data into smaller chunks that can fit into the available memory, sorting these smaller chunks individually, and then merging them back together to produce a fully sorted dataset. External sorting is widely used in database management systems, as well as in data-intensive applications that require sorting of large datasets, such as search engines and data mining applications. The basic approach of the external sort algorithm involves two primary steps: the sorting phase and the merging phase. During the sorting phase, the input data is read from an external storage device, such as a hard disk or solid-state drive, and divided into smaller chunks that can be loaded into the available memory. Each chunk is then sorted using an efficient in-memory sorting algorithm, such as quicksort, mergesort, or heapsort, and the sorted chunks are written back to the external storage device. In the merging phase, the sorted chunks are read back into memory and combined using a merge algorithm, which iteratively selects the smallest element from the front of each chunk and appends it to a new, fully sorted output file. This process continues until all elements have been merged into the final sorted output. In order to minimize the number of I/O operations and improve performance, external sorting algorithms often employ techniques such as multi-way merging and buffering to reduce the number of disk accesses required during the merging phase.
#!/usr/bin/env python

#
# Sort large text files in a minimum amount of memory
#
import os
import argparse


class FileSplitter:
    BLOCK_FILENAME_FORMAT = "block_{0}.dat"

    def __init__(self, filename):
        self.filename = filename
        self.block_filenames = []

    def write_block(self, data, block_number):
        filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
        with open(filename, "w") as file:
            file.write(data)
        self.block_filenames.append(filename)

    def get_block_filenames(self):
        return self.block_filenames

    def split(self, block_size, sort_key=None):
        i = 0
        with open(self.filename) as file:
            while True:
                lines = file.readlines(block_size)

                if lines == []:
                    break

                if sort_key is None:
                    lines.sort()
                else:
                    lines.sort(key=sort_key)

                self.write_block("".join(lines), i)
                i += 1

    def cleanup(self):
        map(lambda f: os.remove(f), self.block_filenames)


class NWayMerge:
    def select(self, choices):
        min_index = -1
        min_str = None

        for i in range(len(choices)):
            if min_str is None or choices[i] < min_str:
                min_index = i

        return min_index


class FilesArray:
    def __init__(self, files):
        self.files = files
        self.empty = set()
        self.num_buffers = len(files)
        self.buffers = {i: None for i in range(self.num_buffers)}

    def get_dict(self):
        return {
            i: self.buffers[i] for i in range(self.num_buffers) if i not in self.empty
        }

    def refresh(self):
        for i in range(self.num_buffers):
            if self.buffers[i] is None and i not in self.empty:
                self.buffers[i] = self.files[i].readline()

                if self.buffers[i] == "":
                    self.empty.add(i)
                    self.files[i].close()

        if len(self.empty) == self.num_buffers:
            return False

        return True

    def unshift(self, index):
        value = self.buffers[index]
        self.buffers[index] = None

        return value


class FileMerger:
    def __init__(self, merge_strategy):
        self.merge_strategy = merge_strategy

    def merge(self, filenames, outfilename, buffer_size):
        buffers = FilesArray(self.get_file_handles(filenames, buffer_size))
        with open(outfilename, "w", buffer_size) as outfile:
            while buffers.refresh():
                min_index = self.merge_strategy.select(buffers.get_dict())
                outfile.write(buffers.unshift(min_index))

    def get_file_handles(self, filenames, buffer_size):
        files = {}

        for i in range(len(filenames)):
            files[i] = open(filenames[i], "r", buffer_size)

        return files


class ExternalSort:
    def __init__(self, block_size):
        self.block_size = block_size

    def sort(self, filename, sort_key=None):
        num_blocks = self.get_number_blocks(filename, self.block_size)
        splitter = FileSplitter(filename)
        splitter.split(self.block_size, sort_key)

        merger = FileMerger(NWayMerge())
        buffer_size = self.block_size / (num_blocks + 1)
        merger.merge(splitter.get_block_filenames(), filename + ".out", buffer_size)

        splitter.cleanup()

    def get_number_blocks(self, filename, block_size):
        return (os.stat(filename).st_size / block_size) + 1


def parse_memory(string):
    if string[-1].lower() == "k":
        return int(string[:-1]) * 1024
    elif string[-1].lower() == "m":
        return int(string[:-1]) * 1024 * 1024
    elif string[-1].lower() == "g":
        return int(string[:-1]) * 1024 * 1024 * 1024
    else:
        return int(string)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--mem", help="amount of memory to use for sorting", default="100M"
    )
    parser.add_argument(
        "filename", metavar="<filename>", nargs=1, help="name of file to sort"
    )
    args = parser.parse_args()

    sorter = ExternalSort(parse_memory(args.mem))
    sorter.sort(args.filename[0])


if __name__ == "__main__":
    main()

LANGUAGE:

DARK MODE: