External Sort Algorithm
The external sort algorithm is a data processing technique for sorting data sets that are too large to fit entirely in internal memory (RAM). It works by dividing the input into chunks small enough to fit in the available memory, sorting each chunk individually, and then merging the sorted chunks into a single fully sorted output. External sorting is widely used in database management systems and in data-intensive applications that must sort large data sets, such as search engines and data mining tools.
The algorithm has two primary phases: a sorting phase and a merging phase. During the sorting phase, the input is read from an external storage device, such as a hard disk or solid-state drive, and divided into chunks that fit in the available memory. Each chunk is sorted with an efficient in-memory algorithm, such as quicksort, mergesort, or heapsort, and written back to external storage as a sorted run. In the merging phase, the sorted runs are read back and combined by a merge step that repeatedly selects the smallest element from the front of each run and appends it to the final sorted output file; this continues until every element has been merged. To keep the number of I/O operations low, external sorting implementations typically use multi-way merging, so that many runs are combined in a single pass, together with buffered reads and writes.
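As an aside, the multi-way merge step described above can also be expressed with Python's standard heapq.merge, which lazily merges several already-sorted iterators. The sketch below is not part of the reference implementation that follows; the helper name merge_sorted_chunks and the chunk file names it receives are hypothetical and serve only to illustrate the idea.

import heapq
from contextlib import ExitStack

def merge_sorted_chunks(chunk_filenames, out_filename):
    # Each input file is assumed to be sorted already; heapq.merge
    # yields their lines in globally sorted order without loading
    # the whole files into memory.
    with ExitStack() as stack:
        files = [stack.enter_context(open(name)) for name in chunk_filenames]
        with open(out_filename, "w") as out:
            for line in heapq.merge(*files):
                out.write(line)

The full script below follows the same two-phase structure, but implements the selection step by hand so that the size of the read and write buffers can be controlled explicitly.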
#!/usr/bin/env python
#
# Sort large text files in a minimum amount of memory
#
import os
import argparse


class FileSplitter:
    BLOCK_FILENAME_FORMAT = "block_{0}.dat"

    def __init__(self, filename):
        self.filename = filename
        self.block_filenames = []

    def write_block(self, data, block_number):
        filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
        with open(filename, "w") as file:
            file.write(data)
        self.block_filenames.append(filename)

    def get_block_filenames(self):
        return self.block_filenames

    def split(self, block_size, sort_key=None):
        i = 0
        with open(self.filename) as file:
            while True:
                # readlines(hint) stops reading once roughly block_size
                # bytes have been consumed, so each block fits in memory.
                lines = file.readlines(block_size)
                if lines == []:
                    break
                if sort_key is None:
                    lines.sort()
                else:
                    lines.sort(key=sort_key)
                self.write_block("".join(lines), i)
                i += 1

    def cleanup(self):
        # map() is lazy in Python 3, so use an explicit loop to make sure
        # the temporary block files are actually removed.
        for filename in self.block_filenames:
            os.remove(filename)


class NWayMerge:
    def select(self, choices):
        # choices maps buffer index -> current front line; return the
        # index holding the smallest line. Iterate over the dict keys,
        # which may be non-contiguous once some input files are exhausted.
        min_index = -1
        min_str = None
        for i in choices:
            if min_str is None or choices[i] < min_str:
                min_index = i
                min_str = choices[i]
        return min_index


class FilesArray:
    def __init__(self, files):
        self.files = files
        self.empty = set()
        self.num_buffers = len(files)
        self.buffers = {i: None for i in range(self.num_buffers)}

    def get_dict(self):
        # Only buffers whose files still have unread lines take part
        # in the merge.
        return {
            i: self.buffers[i] for i in range(self.num_buffers) if i not in self.empty
        }

    def refresh(self):
        # Top up every consumed buffer with the next line from its file;
        # return False once every input file is exhausted.
        for i in range(self.num_buffers):
            if self.buffers[i] is None and i not in self.empty:
                self.buffers[i] = self.files[i].readline()
                if self.buffers[i] == "":
                    self.empty.add(i)
                    self.files[i].close()
        if len(self.empty) == self.num_buffers:
            return False
        return True

    def unshift(self, index):
        value = self.buffers[index]
        self.buffers[index] = None
        return value


class FileMerger:
    def __init__(self, merge_strategy):
        self.merge_strategy = merge_strategy

    def merge(self, filenames, outfilename, buffer_size):
        buffers = FilesArray(self.get_file_handles(filenames, buffer_size))
        with open(outfilename, "w", buffer_size) as outfile:
            while buffers.refresh():
                min_index = self.merge_strategy.select(buffers.get_dict())
                outfile.write(buffers.unshift(min_index))

    def get_file_handles(self, filenames, buffer_size):
        files = {}
        for i in range(len(filenames)):
            files[i] = open(filenames[i], "r", buffer_size)
        return files


class ExternalSort:
    def __init__(self, block_size):
        self.block_size = block_size

    def sort(self, filename, sort_key=None):
        num_blocks = self.get_number_blocks(filename, self.block_size)
        splitter = FileSplitter(filename)
        splitter.split(self.block_size, sort_key)

        merger = FileMerger(NWayMerge())
        # Share the memory budget between one read buffer per sorted block
        # plus one write buffer for the output file. Integer division keeps
        # the result usable as open()'s buffering argument.
        buffer_size = self.block_size // (num_blocks + 1)
        merger.merge(splitter.get_block_filenames(), filename + ".out", buffer_size)

        splitter.cleanup()

    def get_number_blocks(self, filename, block_size):
        return (os.stat(filename).st_size // block_size) + 1


def parse_memory(string):
    if string[-1].lower() == "k":
        return int(string[:-1]) * 1024
    elif string[-1].lower() == "m":
        return int(string[:-1]) * 1024 * 1024
    elif string[-1].lower() == "g":
        return int(string[:-1]) * 1024 * 1024 * 1024
    else:
        return int(string)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--mem", help="amount of memory to use for sorting", default="100M"
    )
    parser.add_argument(
        "filename", metavar="<filename>", nargs=1, help="name of file to sort"
    )
    args = parser.parse_args()

    sorter = ExternalSort(parse_memory(args.mem))
    sorter.sort(args.filename[0])


if __name__ == "__main__":
    main()
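A minimal usage sketch, assuming the listing above has been saved as a module; the file name external_sort.py and the input file name unsorted.txt below are illustrative, not part of the script itself:

sorter = ExternalSort(parse_memory("16M"))  # sort in blocks of roughly 16 MB
sorter.sort("unsorted.txt")                 # writes the result to unsorted.txt.out

# Equivalent command-line invocation:
#   python external_sort.py -m 16M unsorted.txt

The memory string accepts the suffixes K, M, and G (or a plain byte count), and the temporary block files created during the sorting phase are removed once the merge finishes.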