1. Internal sort: any standard sorting algorithm works when memory is enough to hold all of the data.
2. External sort:
used when memory is NOT enough to hold the data, so the solution has to be architected around disk files or multiple machines.
https://karticks.wordpress.com/tag/map-reduce/
http://www.geeksforgeeks.org/sort-numbers-stored-on-different-machines/
1. Store the head pointers of the linked lists in a minHeap of size N, where N is the number of machines.
2. Extract the minimum item from the minHeap. Then update the minHeap: if the extracted node has a next number in its linked list, replace the heap root with that next number; otherwise replace the root with the last node of the heap and decrease the heap size by 1.
3. Repeat step 2 until the heap is empty (a compact sketch follows; the full from-scratch implementation appears later in this post).
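Before the full from-scratch heap implementation later in this post, here is a compact sketch of the same three steps using java.util.PriorityQueue; the Node class and the sample data are only illustrative.

import java.util.PriorityQueue;

class Node {
    int data;
    Node next;
    Node(int data, Node next) { this.data = data; this.next = next; }
}

public class KWayMergeSketch {
    // Step 1: seed a min-heap with the head of every machine's sorted list.
    // Steps 2-3: pop the global minimum, push its successor, repeat until empty.
    static void merge(Node[] heads) {
        PriorityQueue<Node> heap = new PriorityQueue<>((a, b) -> Integer.compare(a.data, b.data));
        for (Node head : heads) {
            if (head != null) heap.add(head);
        }
        while (!heap.isEmpty()) {
            Node min = heap.poll();
            System.out.println(min.data);
            if (min.next != null) heap.add(min.next);
        }
    }

    public static void main(String[] args) {
        Node[] machines = {
            new Node(30, new Node(40, new Node(50, null))),   // machine 1
            new Node(35, new Node(45, null)),                 // machine 2
            new Node(10, new Node(60, new Node(100, null)))   // machine 3
        };
        merge(machines); // prints 10 30 35 40 45 50 60 100
    }
}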
Data: short.MaxValue numbers (32,767 values).
Memory size: 1,200 numbers (the largest array we allow in memory).
In this scenario we put 1,000 numbers in each file, which gives 33 small files and therefore 33 in-memory priority queues. Each queue holds the Top 10 numbers from its file, and the output batch is flushed to disk once it reaches 500 entries. The intermediate station holds 33*2 numbers (each number is tagged with its queue index when it enters the station), so the peak in-memory total is: sum = 33 (priority queues) * 10 (numbers per queue) + 500 (intermediate batch buffer) + 66 (2 integers per small file) = 896 < 1200. A quick check of this arithmetic follows.
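A minimal sketch of that budget check in Java; the constants simply mirror the scenario above.

public class MemoryBudget {
    public static void main(String[] args) {
        int total = Short.MAX_VALUE;                    // 32767 numbers to sort
        int pageSize = 1000;                            // numbers per small file
        int files = (total + pageSize - 1) / pageSize;  // ceiling division -> 33 files
        int topPerQueue = 10;                           // numbers held per file queue
        int batch = 500;                                // output batch flushed to disk
        int intermediate = files * 2;                   // value + queue tag per file
        int peak = files * topPerQueue + batch + intermediate;
        System.out.println(files + " files, peak = " + peak + " numbers"); // 33 files, peak = 896
    }
}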
3. N-way merge sort
1. Divide and conquer.
2. Merge sort has log N layers, and the two-way merges on each layer take O(N) in total, so the run time is O(N log N).
3. Generalize two-way merge to N-way merge (a minimal two-way merge sketch follows this list).
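For reference, the two-way merge step that item 3 generalizes, as a minimal Java sketch:

import java.util.Arrays;

public class TwoWayMerge {
    // Merge two sorted arrays in O(a.length + b.length) time;
    // with log N layers of these merges, total work is O(N log N).
    static int[] merge(int[] a, int[] b) {
        int[] out = new int[a.length + b.length];
        int i = 0, j = 0, k = 0;
        while (i < a.length && j < b.length) {
            out[k++] = (a[i] <= b[j]) ? a[i++] : b[j++];
        }
        while (i < a.length) out[k++] = a[i++];
        while (j < b.length) out[k++] = b[j++];
        return out;
    }

    public static void main(String[] args) {
        System.out.println(Arrays.toString(merge(new int[]{1, 3, 5}, new int[]{2, 4})));
        // [1, 2, 3, 4, 5]
    }
}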
http://www.cnblogs.com/huangxincheng/archive/2012/12/19/2824943.html
// demo.txt
// Input: max. Creates demo.txt with max lines, one random number per line.
// All C# snippets below live in one class and require:
// using System; using System.Collections.Generic; using System.IO; using System.Linq;
public static void createData(int max)
{
    // Seed once up front; the original newed a Random per iteration and
    // slept 2 ms between iterations to vary the time-based seed.
    var rand = new Random((int)DateTime.Now.Ticks);
    using (var sw = new StreamWriter(Environment.CurrentDirectory + "//demo.txt"))
    {
        for (int i = 0; i < max; i++)
        {
            sw.WriteLine(rand.Next(0, int.MaxValue >> 3));
        }
    }
}
// 1.txt, 2.txt ... 33.txt
// Input: size = how many lines one in-memory chunk may hold (estimate of what
// memory can handle). Each small file is sorted before it is written out.
public static int split(int size)
{
    int totalCount = 0;
    var small = new List<int>();
    int pageCount = 0;
    using (var sr = new StreamReader(Environment.CurrentDirectory + "//demo.txt"))
    {
        while (true)
        {
            var line = sr.ReadLine();
            if (!string.IsNullOrEmpty(line))
            {
                // Accumulate lines until the chunk is full
                totalCount++;
                small.Add(Convert.ToInt32(line));
                if (totalCount % size == 0)
                {
                    int pageIndex = totalCount / size;
                    small.Sort();
                    File.WriteAllLines(Environment.CurrentDirectory + "//" + pageIndex + ".txt",
                                       small.Select(i => i.ToString()));
                    small.Clear();
                }
            }
            else
            {
                // End of input: flush whatever is left as the last small file
                pageCount = (int)Math.Ceiling((double)totalCount / size);
                if (small.Count > 0)
                {
                    small.Sort();
                    File.WriteAllLines(Environment.CurrentDirectory + "//" + pageCount + ".txt",
                                       small.Select(i => i.ToString()));
                    small.Clear();
                }
                break; // the original loop never terminated here
            }
        }
    }
    return pageCount; // how many small files were written
}
// Refills one small file's priority queue: reads the next Top N numbers of
// (i+1).txt into queue i; called at startup and whenever that queue runs empty.
// skip[i] remembers how many lines of file i have already been consumed.
// PriorityQueue<T> is the custom min-heap from the cnblogs reference below;
// its assumed API: Enqueue(T t, int level), Dequeue() returning the minimum
// node (null when empty) with payload .t and priority .level, and Size().
public static void AddQueue(int i, List<PriorityQueue<int?>> list, ref int[] skip, int top = 10)
{
    var result = File.ReadAllLines(Environment.CurrentDirectory + "//" + (i + 1) + ".txt")
                     .Skip(skip[i])
                     .Take(top)
                     .Select(j => Convert.ToInt32(j))
                     .ToList();
    // The payload is unused for file queues; the number itself is the priority.
    foreach (var item in result)
    {
        list[i].Enqueue(null, item);
    }
    // Next call skips the lines consumed so far.
    skip[i] += result.Count;
}
// result.txt
// Test: memory budget = 1200 numbers; pageSize = 1000 lines per small file;
// pageCount = 33 small files => 33 priority queues, each seeded with the
// Top 10 of its file. Peak memory: 33*10 + 500 + 66 = 896 < 1200.
public static void Main()
{
    // 1. Data: generate short.MaxValue (32767) numbers
    createData(short.MaxValue);
    // Number of lines in each small file
    var pageSize = 1000;
    // Flush the output batch to disk whenever it reaches this count
    var batchCount = 500;
    // Split demo.txt; returns the number of small files
    var pageCount = split(pageSize);

    // 2. Chunk: one priority queue per small file, each holding its Top 10
    var list = new List<PriorityQueue<int?>>();
    // Intermediate control queue: at most one candidate per file queue,
    // tagged with the index of the queue it came from
    var intermediateQueueControl = new PriorityQueue<int>();
    // How many lines of each small file have been consumed so far
    var skip = new int[pageCount];
    // Completion flag per small file
    var complete = new bool[pageCount];
    int allComplete = 0;
    for (int i = 0; i < pageCount; i++)
    {
        list.Add(new PriorityQueue<int?>());
        AddQueue(i, list, ref skip);
    }

    // 3. Merge: seed the control queue with the minimum of every file queue
    for (int i = 0; i < list.Count; i++)
    {
        var temp = list[i].Dequeue();
        intermediateQueueControl.Enqueue(i, temp.level);
    }
    var batch = new List<int>();
    while (intermediateQueueControl.Size() > 0)
    {
        // Global minimum; its payload says which file queue it came from
        var single = intermediateQueueControl.Dequeue();
        int nextIndex = single.t;
        var nextData = list[nextIndex].Dequeue();
        // That file's queue is empty: refill it from disk
        if (nextData == null)
        {
            AddQueue(nextIndex, list, ref skip);
            // Still nothing fetched, meaning the file itself is exhausted
            if (list[nextIndex].Size() == 0)
            {
                complete[nextIndex] = true;
                allComplete++;
            }
            else
            {
                nextData = list[nextIndex].Dequeue();
            }
        }
        // Push that queue's next candidate into the control queue
        if (nextData != null)
        {
            intermediateQueueControl.Enqueue(nextIndex, nextData.level);
        }
        batch.Add(single.level);
        // Flush the batch when it is full or everything is done
        if (batch.Count == batchCount || allComplete == pageCount)
        {
            using (var sw = new StreamWriter(Environment.CurrentDirectory + "//result.txt", true))
            {
                foreach (var item in batch)
                {
                    sw.WriteLine(item);
                }
            }
            batch.Clear();
        }
    }
    Console.WriteLine("Done");
    Console.Read();
    // 4. clean
}
class ListNode {
    int data;
    ListNode next;
}

class MinHeapNode {
    ListNode head;
}

class MinHeap {
    int count;
    int capacity;
    MinHeapNode[] array;
}

// Wrapper class added so the snippet compiles as one file.
public class ExternalSortAcrossMachines {

    public static MinHeap createMinHeap(int capacity) {
        MinHeap minHeap = new MinHeap();
        minHeap.capacity = capacity;
        minHeap.count = 0; // initialize as ZERO
        minHeap.array = new MinHeapNode[minHeap.capacity];
        for (int i = 0; i < capacity; i++) {
            minHeap.array[i] = new MinHeapNode();
        }
        return minHeap;
    }

    // Insert a new node at the beginning of the linked list
    public static ListNode push(ListNode head, int newData) {
        ListNode newNode = new ListNode();
        newNode.data = newData;
        newNode.next = head;
        return newNode;
    }

    // Restore the min-heap property at index idx
    public static void minHeapify(MinHeap minHeap, int idx) {
        int left = 2 * idx + 1;
        int right = 2 * idx + 2;
        int smallest = idx;
        if (left < minHeap.count
                && minHeap.array[left].head.data < minHeap.array[smallest].head.data) {
            smallest = left;
        }
        if (right < minHeap.count
                && minHeap.array[right].head.data < minHeap.array[smallest].head.data) {
            smallest = right;
        }
        if (smallest != idx) {
            MinHeapNode tmp = minHeap.array[smallest];
            minHeap.array[smallest] = minHeap.array[idx];
            minHeap.array[idx] = tmp;
            minHeapify(minHeap, smallest); // sift down recursively
        }
    }

    public static boolean isEmpty(MinHeap minHeap) {
        return minHeap.count == 0;
    }

    public static void buildMinHeap(MinHeap minHeap) {
        int n = minHeap.count;
        for (int i = (n - 2) / 2; i >= 0; i--) {
            minHeapify(minHeap, i);
        }
    }

    public static void populateMinHeap(MinHeap minHeap, ListNode[] array, int n) {
        for (int i = 0; i < n; i++) {
            // count was initialized as ZERO
            minHeap.array[minHeap.count++].head = array[i];
        }
        buildMinHeap(minHeap);
    }

    public static ListNode extractMin(MinHeap minHeap) {
        // validate the input
        if (isEmpty(minHeap)) {
            return null;
        }
        // Capture the minimum BEFORE mutating the root (the original returned
        // tmp.head after overwriting it, which skipped values)
        MinHeapNode tmp = minHeap.array[0];
        ListNode min = tmp.head;
        if (tmp.head.next != null) {
            // Replace the root with the next node of the extracted list
            minHeap.array[0].head = tmp.head.next;
        } else {
            // That list is empty: move the last heap node up, shrink the heap
            minHeap.array[0] = minHeap.array[minHeap.count - 1];
            minHeap.count--;
        }
        minHeapify(minHeap, 0);
        return min;
    }

    public static void externalSort(ListNode[] array, int N) {
        MinHeap minHeap = createMinHeap(N);
        populateMinHeap(minHeap, array, N);
        while (!isEmpty(minHeap)) {
            ListNode tmp = extractMin(minHeap);
            System.out.println(tmp.data);
        }
    }

    public static void main(String[] args) {
        int N = 3; // number of machines
        ListNode[] array = new ListNode[N];
        array[0] = push(null, 50);
        array[0] = push(array[0], 40);
        array[0] = push(array[0], 30);
        array[1] = push(null, 45);
        array[1] = push(array[1], 35); // insert at the beginning
        array[2] = push(null, 100);
        array[2] = push(array[2], 80);
        array[2] = push(array[2], 70);
        array[2] = push(array[2], 60);
        array[2] = push(array[2], 10);
        externalSort(array, N);
    }
}

Output: 10 30 35 40 45 50 60 70 80 100

4. Reference:
http://www.geeksforgeeks.org/sort-numbers-stored-on-different-machines/
http://www.cnblogs.com/huangxincheng/archive/2012/12/19/2824943.html