Sunday, November 8, 2015

[SD] External sort

1. Internal sort:
any in-memory sorting algorithm works, since the data fits in memory.

2. External sort:
used when the data does NOT fit in memory; the sort has to be architected around disk: split the input into chunks that fit, sort each chunk, then merge the sorted chunks (a minimal sketch of this pipeline follows the link below).
https://karticks.wordpress.com/tag/map-reduce/
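Here is the shape of the pipeline in miniature, as a hypothetical C# sketch (the file names, chunk size, and helper names are made up for illustration). It covers phase 1, splitting a big file into sorted chunk files; the merge phase is sketched in the next section.

using System.IO;
using System.Linq;
using System.Collections.Generic;

static class ExternalSortSketch {
    // Phase 1: split the input file into sorted chunk files of at most chunkSize lines each.
    public static List<string> SplitAndSortChunks(string path, int chunkSize) {
        var chunkFiles = new List<string>();
        var buffer = new List<int>(chunkSize);
        foreach (var n in File.ReadLines(path).Select(int.Parse)) {
            buffer.Add(n);
            if (buffer.Count == chunkSize) Flush(buffer, chunkFiles);
        }
        if (buffer.Count > 0) Flush(buffer, chunkFiles); // last, possibly partial chunk
        return chunkFiles;
    }

    // Sort one in-memory chunk and write it out as the next numbered file.
    static void Flush(List<int> buffer, List<string> chunkFiles) {
        buffer.Sort();
        var name = (chunkFiles.Count + 1) + ".txt";
        File.WriteAllLines(name, buffer.Select(i => i.ToString()));
        chunkFiles.Add(name);
        buffer.Clear();
    }
}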

http://www.geeksforgeeks.org/sort-numbers-stored-on-different-machines/
1. Store the head pointers of the linked lists in a minHeap of size N, where N is the number of machines.
2. Extract the minimum item from the minHeap. Then update the minHeap: either replace the root with the next node from that item's linked list, or, if that list is exhausted, replace the root with the last node of the heap and decrease the heap size by 1.
3. Repeat step 2 until the heap is empty.
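The same three steps, sketched with the built-in PriorityQueue<TElement, TPriority> (available from .NET 6 on; older frameworks need a hand-rolled heap like the Java one at the end of this post). Arrays stand in for the per-machine linked lists, and the names are made up.

using System.Collections.Generic;

static class KWayMerge {
    // Merge N sorted sequences by repeatedly extracting the global minimum.
    public static IEnumerable<int> Merge(int[][] machines) {
        // Heap entries are (machine index, position), keyed by the current value.
        var heap = new PriorityQueue<(int m, int pos), int>();
        for (int m = 0; m < machines.Length; m++)              // step 1: the heads
            if (machines[m].Length > 0)
                heap.Enqueue((m, 0), machines[m][0]);
        while (heap.TryDequeue(out var cur, out int value)) {  // steps 2 and 3
            yield return value;                                // emit the minimum
            int next = cur.pos + 1;
            if (next < machines[cur.m].Length)                 // advance that machine's list
                heap.Enqueue((cur.m, next), machines[cur.m][next]);
        }
    }
}

Usage: KWayMerge.Merge(new[] { new[] {30,40,50}, new[] {35,45}, new[] {10,60,70,80,100} }) yields 10 30 35 40 45 50 60 70 80 100, matching the expected output at the end of this post.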

Data: short.MaxValue numbers (32,767, roughly 2^15).

Memory limit: 1,200 numbers (the size of the working array).

In this scenario we put 1,000 numbers in each small file, which gives 33 small files and therefore 33 in-memory priority queues; each queue holds the Top 10 of its file, and the output batch is flushed to disk when it reaches Batch = 500.

The intermediate ("transfer") queue holds at most 33*2 numbers, because each entry is tagged with its source-queue index on the way in. The peak number of items in memory is therefore: sum = 33 (priority queues) * 10 (numbers per queue) + 500 (intermediate batch) + 66 (2 integers per small file) = 896 < 1200.
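A quick sanity check of that budget, as a standalone code fragment (the constants just mirror the scenario above; nothing here is part of the actual implementation):

// Memory-budget sanity check (C# top-level statements).
using System;

int files       = (int)Math.Ceiling(short.MaxValue / 1000.0); // 33 small files
int perQueue    = 10;          // Top 10 kept per small-file queue
int batch       = 500;         // output batch buffer
int bookkeeping = files * 2;   // 2 integers tracked per small file
int sum = files * perQueue + batch + bookkeeping;             // 330 + 500 + 66
Console.WriteLine(sum + " < 1200: " + (sum < 1200));          // prints "896 < 1200: True"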
3. N-way merge sort
1. Divide and conquer.
2. Merge sort has log N levels, and merging at each level costs O(N), so the total running time is O(N log N).
3. Generalize the two-way merge to an N-way merge (the recurrence after the link below makes the cost explicit).
http://www.cnblogs.com/huangxincheng/archive/2012/12/19/2824943.html
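For completeness, the recurrence behind the O(N log N) claim (each level of the recursion merges a total of N items at cost cN, and there are log2 N levels):

$$T(N) = 2\,T(N/2) + cN,\qquad T(1) = c \;\Longrightarrow\; T(N) = cN\log_2 N + cN = O(N\log N)$$

For the N-way merge itself: if K numbers in total are spread over N sorted runs, each number passes through a heap of size N once, so the merge phase costs O(K log N).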

// demo.txt
// Input: max; create a file with max lines, one random number per line
public static void CreateData(int max){
    var sw = new StreamWriter(Environment.CurrentDirectory + "//demo.txt");
    // A single Random instance avoids reseeding on every iteration
    // (the original re-created Random per line and slept 2 ms to vary the seed).
    var rand = new Random((int)DateTime.Now.Ticks);
    for (int i = 0; i < max; i++){
        sw.WriteLine(rand.Next(0, int.MaxValue >> 3));
    }
    sw.Close();
}
// 1.txt, 2.txt ... 33.txt
// Input: size (an estimate of how many lines memory can hold; the input is chunked into small files of that size)
// Each small file is written out sorted: small.OrderBy(i => i).ToList()

public static int Split(int size){
    int totalCount = 0;
    List<int> small = new List<int>();
    var sr = new StreamReader(Environment.CurrentDirectory + "//demo.txt");

    var pageSize = size;
    int pageCount = 0;
    int pageIndex = 0;
    while (true) {
        var line = sr.ReadLine();
        // Not yet done
        if (!string.IsNullOrEmpty(line)){
            // Count towards the current small file
            totalCount++;
            small.Add(Convert.ToInt32(line));
            if (totalCount % pageSize == 0){
                pageIndex = totalCount / pageSize;
                small = small.OrderBy(i => i).ToList(); // sort this chunk in memory
                File.WriteAllLines(Environment.CurrentDirectory + "//" + pageIndex + ".txt", small.Select(i => i.ToString()));
                small.Clear();
            }
        }
        // Done: flush the last (possibly partial) chunk and stop
        else {
            pageCount = (int)Math.Ceiling((double)totalCount / pageSize);
            if (small.Count > 0){
                small = small.OrderBy(i => i).ToList();
                File.WriteAllLines(Environment.CurrentDirectory + "//" + pageCount + ".txt", small.Select(i => i.ToString()));
                small.Clear();
            }
            break;
        }
    }
    sr.Close();
    return pageCount; // how many small files were written
}
// result.txt
// Load the Top N of a small file into its priority queue; when the queue drains
// (its contents have moved on towards the result), load the next Top N.
// PriorityQueue is the min-heap keyed by "level" from the cnblogs post referenced above,
// assumed to expose Enqueue(t, level), Dequeue(), and Count.
public static void AddQueue(int i, List<PriorityQueue> list, ref int[] skip, int top = 10){
    var result = File.ReadAllLines(Environment.CurrentDirectory + "//" + (i + 1) + ".txt")
                     .Skip(skip[i]).Take(top).Select(j => Convert.ToInt32(j));
    // Put into the i-th PQ; the number itself is the priority ("level")
    foreach (var item in result){
        list[i].Enqueue(null, item);
    }
    // How many lines to skip on the next read of this file
    skip[i] += result.Count();
}
// Test
// size = 1200 (memory limit, in numbers)
// pageSize = 1000 (lines per small file)
// pageCount = 33 (33 small files)
// 33 files => 33 priority queues
// Build each priority queue with the Top 10 of its file
// Peak in memory: sum = 33*10 + 500 + 66 = 896 < 1200; time: O(N log N)

public static void Main(){
    // 1. Data
    // Generate short.MaxValue (~2^15) numbers
    CreateData(short.MaxValue);
    // Number of lines in each small file
    var pageSize = 1000;
    // Flush the batch to result.txt when it reaches batchCount
    var batchCount = 500;
    // Number of small files written
    var pageCount = Split(pageSize);

    // 2. Chunk
    // Memory limit: 1200 numbers
    List<PriorityQueue> list = new List<PriorityQueue>();
    // Intermediate queue: holds the current head of each small-file queue
    PriorityQueue intermediateQueueControl = new PriorityQueue();
    // Next line offset to read from each small file
    int[] skip = new int[pageCount];
    // Status of each priority queue
    bool[] complete = new bool[pageCount];
    // How many queues are fully drained?
    int allComplete = 0;
    // Build the priority queues
    for (int i = 0; i < pageCount; i++){
        list.Add(new PriorityQueue());
        AddQueue(i, list, ref skip);
    }

    // 3. Merge
    // Seed the intermediate queue with the minimum of each small-file queue;
    // the payload is the queue index, the priority is the number itself.
    for (int i = 0; i < list.Count; i++){
        var temp = list[i].Dequeue();
        intermediateQueueControl.Enqueue(i, temp.level);
    }
    List<int> batch = new List<int>();
    int nextIndex = 0;
    while (intermediateQueueControl.Count > 0) {
        // Take the current global minimum
        var single = intermediateQueueControl.Dequeue();
        // Refill from the queue it came from
        nextIndex = (int)single.t;
        var nextData = list[nextIndex].Dequeue();
        // Empty: that small file's priority queue has drained
        if (nextData == null){
            // Load the next Top N from the file
            AddQueue(nextIndex, list, ref skip);
            // Still nothing, so the file itself is exhausted
            if (list[nextIndex].Count == 0){
                complete[nextIndex] = true;
                allComplete++;
            } else {
                nextData = list[nextIndex].Dequeue();
            }
        }
        // Not empty: its minimum goes into intermediateQueueControl
        if (nextData != null){
            intermediateQueueControl.Enqueue(nextIndex, nextData.level);
        }
        batch.Add(single.level);

        if (batch.Count == batchCount || allComplete == pageCount) {
            var sw = new StreamWriter(Environment.CurrentDirectory + "//result.txt", true);
            foreach (var item in batch){
                sw.WriteLine(item);
            }
            sw.Close();
            batch.Clear();
        }
    }
    Console.WriteLine("Done");
    Console.Read();

    // 4. Clean up the temporary small files if desired
}

class ListNode {
    int data;
    ListNode next;
}
class MinHeapNode {
    ListNode head;
}
class MinHeap {
    int count;
    int capacity;
    MinHeapNode[] array;
}


public MinHeap createMinHeap(int capacity){
    MinHeap minHeap = new MinHeap();
    minHeap.capacity = capacity;
    minHeap.count = 0; // initialize as ZERO
    minHeap.array = new MinHeapNode[minHeap.capacity];
    return minHeap;
}

// Insert a new node at the beginning of the linked list and return the new head
public ListNode push(ListNode head, int new_data){
    ListNode newNode = new ListNode();
    newNode.data = new_data;
    newNode.next = head;
    return newNode;
}
public void minHeapify(MinHeap minHeap, int idx){
    int left, right, smallest;
    left = 2*idx + 1;
    right = 2*idx + 2;
    smallest = idx;
    if (left < minHeap.count && minHeap.array[left].head.data < minHeap.array[smallest].head.data){
        smallest = left;
    }
    if (right < minHeap.count && minHeap.array[right].head.data < minHeap.array[smallest].head.data){
        smallest = right;
    }
    if (smallest != idx){
        MinHeapNode tmp = minHeap.array[smallest];
        minHeap.array[smallest] = minHeap.array[idx];
        minHeap.array[idx] = tmp;
        minHeapify(minHeap, smallest); // recurse into the subtree we swapped into
    }
}
public boolean isEmpty(MinHeap minHeap){
      return (minHeap.count == 0);
}
public void buildMinHeap(MinHeap minHeap){
    int n = minHeap.count;
    // heapify from the last internal node up to the root
    for (int i = (n - 2) / 2; i >= 0; i--){
        minHeapify(minHeap, i);
    }
}
public void populateMinHeap(MinHeap minHeap, ListNode[] array, int n){
    for (int i = 0; i < n; i++){
        // count was initialized as ZERO in createMinHeap
        MinHeapNode node = new MinHeapNode();
        node.head = array[i];
        minHeap.array[minHeap.count++] = node;
    }
    buildMinHeap(minHeap);
}
public ListNode extractMin(MinHeap minHeap){
    // validate the input
    if (isEmpty(minHeap)){
        return null;
    }
    // save the minimum before overwriting the root
    ListNode result = minHeap.array[0].head;
    if (result.next != null){
        // the root's list still has nodes: advance its head
        minHeap.array[0].head = result.next;
    } else {
        // the root's list is exhausted: move the last node up and shrink the heap
        minHeap.array[0] = minHeap.array[minHeap.count - 1];
        minHeap.count--;
    }
    // restore the heap property from the root down
    minHeapify(minHeap, 0);
    return result;
}
public void externalSort(ListNode[] array, int N){
    MinHeap minHeap = createMinHeap(N);
    populateMinHeap(minHeap, array, N);
    while (!isEmpty(minHeap)){
        ListNode tmp = extractMin(minHeap);
        System.out.print(tmp.data + " ");
    }
}
public static void main(String[] args){
    int N = 3; // number of machines
    ListNode[] array = new ListNode[N];
    // push returns the new head, so capture it each time
    array[0] = null;
    array[0] = push(array[0], 50);
    array[0] = push(array[0], 40);
    array[0] = push(array[0], 30);
    array[1] = null;
    array[1] = push(array[1], 45);
    array[1] = push(array[1], 35); // insert at the beginning
    array[2] = null;
    array[2] = push(array[2], 100);
    array[2] = push(array[2], 80);
    array[2] = push(array[2], 70);
    array[2] = push(array[2], 60);
    array[2] = push(array[2], 10);
    externalSort(array, N);
}

Output:

10 30 35 40 45 50 60 70 80 100
4. References: http://www.geeksforgeeks.org/sort-numbers-stored-on-different-machines/ and http://www.cnblogs.com/huangxincheng/archive/2012/12/19/2824943.html
