This post has already been read 2213 times!

Problem

If you have a 2 GB file with one string per line, which sorting algorithm would you use
to sort the file and why?

Solution

When an interviewer gives a size limit of 2GB, it should tell you something - in this case, it
suggests that they don¡¯t want you to bring all the data into memory
So what do we do? We only bring part of the data into memory

Algorithm

How much memory do we have available? Let's assume we have X MB of memory available

  • Divide the file into K chunks, where X * K = 2 GB Bring each chunk into memory and
    sort the lines as usual using any O(n log n) algorithm Save the lines back to the file
  • Now bring the next chunk into memory and sort
  • Once we are done, merge them one by one

The above algorithm is also known as external sort. Step 3 is known as N-way merge. The rationale behind using external sort is the size of data Since the data is too huge and we cant bring it all into memory, we need to go for a disk based sorting algorithm

After thinking long and hard about this problem, it didn’t occur to me the issue would be putting the whole 2 gigs into memory to be sorted. Now that I think about it, having a quarter or half of you RAM sucked into just sorting a massive file sucks. So the idea needed here is just to chunk the file into smaller pieces. For this problem I guess doing a full mergesort would work since it runs the quickest in O(nlogn) time, since we will need to merge the pieces together in the end anyways.

For simplicity, I’m going abstract the details of the mergesort away, since the takeaway for this problem is to chunk the file into manageable sorted pieces and then merge those sorted pieces together.

Algorithm for External Sorting

def large_file_sort(chunksize, file):
    f = open(file, 'r+')
    chunk = []
    curr = 0
    i = 0
    for line in f:
        index = i % chunksize
        chunk[index] = line
        if index == chunksize - 1:
            mergesort(chunk)
            prev = curr
            curr = f.tell
            f.seek(curr)
            f.write(join(chunk))
     file_merge(chunksize, file)
     f.close()

def file_merge(chunksize, file): f = open(file, 'r+) acc = [] chunk = [] i = 0 for line in f: index = i % chunksize chunk[index] = line if index == chunksize - 1 merge(acc, chunk)

Code for External Sorting in C Programming


#include "stdio.h"
#include "conio.h"
#include "stdlib.h"

void shownums ( char * ) ;
void split ( char * ) ;
void sort ( char * ) ;

void main( )
{
    char str[67] ;

    clrscr( ) ;

    printf ( "Enter file name: " ) ;
    scanf ( "%s", str ) ;

    printf ( "Numbers before sorting:\n" ) ;
    shownums ( str ) ;

    split ( str ) ;
    sort ( str ) ;

    printf ( "\nNumbers after sorting:\n" ) ;
    shownums ( str ) ;

    getch( ) ;
}

/* Displays the contents of file */
void shownums ( char *p )
{
    FILE *fp ;
    int i ;

    fp = fopen ( p, "rb" ) ;
    if ( fp == NULL )
    {
        printf ( "Unable to open file." ) ;
        getch( ) ;
        exit ( 0 ) ;
    }

    while ( fread ( &i, sizeof ( int ), 1, fp ) != 0 )
        printf ( "%d \t", i ) ;

    fclose ( fp ) ;
}

/* Splits the original file into two files */
void split ( char *p )
{
    FILE *fs, *ft ;
    longint l, count ;
    int i ;

    fs = fopen ( p, "rb" ) ;
    if ( fs == NULL )
    {
        printf ( "Unable to open file." ) ;
        getch( ) ;
        exit ( 0 ) ;
    }

    ft = fopen ( "temp1.dat", "wb" ) ;
    if ( ft == NULL )
    {
        fclose ( fs ) ;
        printf ( "Unable to open file." ) ;
        getch( ) ;
        exit ( 1 ) ;
    }

    fseek ( fs, 0L, SEEK_END ) ;
    l = ftell ( fs ) ;
    fseek ( fs, 0L, SEEK_SET ) ;

    l = l / ( sizeof ( int ) * 2 ) ;

    for ( count = 1 ; count <=  l ; count++ )
    {
        fread ( &i, sizeof ( int ), 1, fs ) ;
        fwrite ( &i, sizeof ( int ), 1, ft ) ;
    }

    fclose ( ft ) ;

    ft = fopen ( "temp2.dat", "wb" ) ;
    if ( ft == NULL )
    {
        fclose ( fs ) ;
        printf ( "Unable to open file." ) ;
        getch( ) ;
        exit ( 2 ) ;
    }

    while (    fread ( &i, sizeof ( int ), 1, fs ) != 0 )
        fwrite ( &i, sizeof ( int ), 1, ft ) ;

    fcloseall ( ) ;
}

/* Sorts the file */
void sort ( char *p )
{
    FILE *fp[4] ;
    char *fnames[ ] =
                    {
                        "temp1.dat",
                        "temp2.dat",
                        "final1.dat",
                        "final2.dat"
                    } ;

    int i, j = 1, i1, i2, flag1, flag2, p1, p2 ;
    longint l ;

    while ( 1 )
    {
        for ( i = 0 ; i <= 1 ; i++ )
        {
            fp[i] = fopen ( fnames[i], "rb+" ) ;
            if ( fp[i] == NULL )
            {
                fcloseall( ) ;
                printf ( "Unable to open file." ) ;
                getch( ) ;
                exit ( i ) ;
            }

            fseek ( fp[i], 0L, SEEK_END ) ;
            l = ftell ( fp[i] ) ;
            if ( l == 0 )
                gotoout ;
            fseek ( fp[i], 0L, SEEK_SET ) ;
        }

        for ( i = 2 ; i <= 3 ; i++ )
        {
            fp[i] = fopen ( fnames[i], "wb" ) ;
            if ( fp[i] == NULL )
            {
                fcloseall( ) ;
                printf ( "Unable to open file." ) ;
                getch( ) ;
                exit ( i ) ;
            }
        }

        i = 2 ;
        i1 = i2 = 0 ;
        flag1 = flag2 = 1 ;

        while ( 1 )
        {
            if ( flag1 )
            {
                if ( fread ( &p1, sizeof ( int ), 1, fp[0] ) == 0 )
                {
                    /* If first file ends then the whole content of second
                       file is written in the respective target file */
while ( fread ( &p2, sizeof ( int ), 1, fp[1] ) != 0 )
                        fwrite ( &p2, sizeof ( int ), 1, fp[i] ) ;
                    break ;
                }
            }

            if ( flag2 )
            {
                if ( fread ( &p2, sizeof ( int ), 1, fp[1] ) == 0 )
                {
                    /* If second file ends then the whole content of first
                       file is written in the respective target file */

                    fwrite ( &p1, sizeof ( int ), 1, fp[i] ) ;
                    while ( fread ( &p1, sizeof ( int ), 1, fp[0] ) != 0 )
                        fwrite ( &p1, sizeof ( int ), 1, fp[i] ) ;
                    break ;
                }
            }

            if ( p1 < p2 )
            {
                flag2 = 0 ;
                flag1 = 1 ;
                fwrite ( &p1, sizeof ( int ), 1, fp[i] ) ;
                i1++ ;
            }
            else
            {
                flag1 = 0 ;
                flag2 = 1 ;
                fwrite ( &p2, sizeof ( int ), 1, fp[i] ) ;
                i2++ ;
            }

            if ( i1 == j )
            {
                flag1 = flag2 = 1 ;
                fwrite ( &p2, sizeof ( int ), 1, fp[i] ) ;
                for ( i2++ ; i2 < j ; i2++ )
                {
                    if ( fread ( &p2, sizeof ( int ), 1, fp[1] ) != 0 )
                        fwrite ( &p2, sizeof ( int ), 1, fp[i] ) ;
                }
                i1 = i2 = 0 ;
                i == 2 ? ( i = 3 ) : ( i = 2 ) ;
            }

            if ( i2 == j )
            {
                flag1 = flag2 = 1 ;
                fwrite ( &p1, sizeof ( int ), 1, fp[i] ) ;
                for ( i1++ ; i1 < j ; i1++ )
                {
                    if ( fread ( &p1, sizeof ( int ), 1, fp[0] ) != 0 )
                        fwrite ( &p1, sizeof ( int ), 1, fp[i] ) ;
                }
                i1 = i2 = 0 ;
                i == 2 ? ( i = 3 ) : ( i = 2 ) ;
            }
        }

        fcloseall( ) ;
        remove ( fnames[0] ) ;
        remove ( fnames[1] ) ;
        rename ( fnames[2], fnames[0] ) ;
        rename ( fnames[3], fnames[1] ) ;
        j *= 2 ;
    }

    out :

    fcloseall( ) ;
    remove ( p ) ;
    rename ( fnames[0], p ) ;
    remove ( fnames[1] ) ;
}

Leave a Reply

Post Navigation