Sunday, November 8, 2015
[SD] External sort
1. Internal sort:
all sorting algorithms used when the data fits entirely in memory
2. External sort:
used when the data does NOT fit in memory;
the processing has to be architected around disk (split into chunks, sort each chunk, then merge)
https://karticks.wordpress.com/tag/map-reduce/

http://www.geeksforgeeks.org/sort-numbers-stored-on-different-machines/
1. Store the head pointers of the linked lists in a minHeap of size N, where N is the number of machines.
2. Extract the minimum item from the minHeap. Update the minHeap either by replacing its head with the next number from that linked list, or (if that list is exhausted) by replacing the head with the last node of the heap and decreasing the heap size by 1.
3. Repeat step 2 until the heap is empty (see the sketch below).
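A minimal sketch of those three steps using java.util.PriorityQueue instead of a hand-rolled heap (the full array-based MinHeap version from the GeeksforGeeks link appears further down); the ListNode class and the sample lists here are illustrative assumptions:

import java.util.PriorityQueue;

public class KWayMergeSketch {
    static class ListNode {
        int data;
        ListNode next;
        ListNode(int data, ListNode next) { this.data = data; this.next = next; }
    }

    // Merge N sorted linked lists (one per machine) by repeatedly extracting the smallest head.
    static void mergePrint(ListNode[] heads) {
        PriorityQueue<ListNode> heap = new PriorityQueue<>((a, b) -> Integer.compare(a.data, b.data));
        for (ListNode h : heads) {
            if (h != null) heap.offer(h);              // step 1: seed the heap with each list's head
        }
        while (!heap.isEmpty()) {                      // step 3: until the heap is empty
            ListNode min = heap.poll();                // step 2: extract the global minimum
            System.out.println(min.data);
            if (min.next != null) heap.offer(min.next); // replace it with the next node of the same list
        }
    }

    public static void main(String[] args) {
        ListNode[] machines = {
            new ListNode(30, new ListNode(40, new ListNode(50, null))),
            new ListNode(35, new ListNode(45, null)),
            new ListNode(10, new ListNode(60, new ListNode(70, null)))
        };
        mergePrint(machines); // 10 30 35 40 45 50 60 70
    }
}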
Data: short.MaxValue numbers (32,767).
Memory size: room for 1200 numbers (the array capacity).
In this scenario we put 1000 numbers in each small file, which gives 33 small files and therefore 33 in-memory priority queues. Each queue loads the top 10 numbers of its file, and the result batch is flushed to disk once it reaches Batch = 500. The intermediate "transfer station" queue holds 33*2 numbers (each value is tagged with the index of the queue it came from when it enters the transfer station). The peak number of items alive in memory is therefore:
sum = 33 (priority queues) * 10 (numbers per queue) + 500 (batch buffer before each disk flush) + 66 (transfer station: 33 values plus their queue tags) = 896 < 1200.
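A quick sanity check of that arithmetic, with illustrative variable names (not part of the original code):

public class MemoryBudget {
    public static void main(String[] args) {
        int data = Short.MAX_VALUE;                     // 32767 numbers to sort
        int pageSize = 1000;                            // numbers per small file
        int files = (data + pageSize - 1) / pageSize;   // = 33 small files
        int perQueue = 10;                              // top-N kept in each file's queue
        int batch = 500;                                // result buffer flushed to disk
        int transfer = files * 2;                       // value + queue tag in the transfer station
        int peak = files * perQueue + batch + transfer; // 330 + 500 + 66
        System.out.println(files + " files, peak in-memory items = " + peak); // 896 < 1200
    }
}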
3. N-way merge sort
1. divide and conquer
2. merge sort has log N levels, and the two-way merges on each level process N elements in total, so the running time is O(N log N)
3. generalize two-way merge to N-way merge (a minimal two-way merge sketch follows the link below)
http://www.cnblogs.com/huangxincheng/archive/2012/12/19/2824943.html
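A minimal sketch (not from the linked post) of the basic two-way merge step that point 3 generalizes to N ways; the class name and array values are illustrative:

import java.util.Arrays;

public class TwoWayMerge {
    // Merge two already-sorted arrays into one sorted array.
    static int[] merge(int[] a, int[] b) {
        int[] out = new int[a.length + b.length];
        int i = 0, j = 0, k = 0;
        while (i < a.length && j < b.length) {
            out[k++] = (a[i] <= b[j]) ? a[i++] : b[j++]; // always take the smaller head
        }
        while (i < a.length) out[k++] = a[i++];          // drain the leftovers
        while (j < b.length) out[k++] = b[j++];
        return out;
    }

    public static void main(String[] args) {
        System.out.println(Arrays.toString(merge(new int[]{1, 3, 5}, new int[]{2, 4, 6}))); // [1, 2, 3, 4, 5, 6]
    }
}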

4. Reference:
http://www.geeksforgeeks.org/sort-numbers-stored-on-different-machines/
http://www.cnblogs.com/huangxincheng/archive/2012/12/19/2824943.html
// demo.txt
// Input: max — write `max` lines to demo.txt, one random number per line
// (usings needed across these C# snippets: System, System.IO, System.Linq,
//  System.Threading, System.Collections.Generic)
public static void createData(int max){
    var sw = new StreamWriter(Environment.CurrentDirectory + "//demo.txt");
    for (int i = 0; i < max; i++){
        // sleep briefly so the tick-based seed changes between iterations
        Thread.Sleep(2);
        var rand = new Random((int)DateTime.Now.Ticks);
        sw.WriteLine(rand.Next(0, int.MaxValue >> 3));
    }
    sw.Close();
}
// 1.txt, 2.txt ... 33.txt
// Input: size — how many lines one small file (one in-memory chunk) can hold.
// Each small file is written out sorted: small.OrderBy(i => i).ToList();
public static int split(int size){
    int totalCount = 0;
    List<int> small = new List<int>();
    var sr = new StreamReader(Environment.CurrentDirectory + "//demo.txt");
    var pageSize = size;
    int pageCount = 0;
    int pageIndex = 0;
    while (true)
    {
        var line = sr.ReadLine();
        // Not yet done
        if (!string.IsNullOrEmpty(line)){
            // Count towards the current small file
            totalCount++;
            small.Add(Convert.ToInt32(line));
            if (totalCount % pageSize == 0){
                pageIndex = totalCount / pageSize;
                small = small.OrderBy(i => i).ToList();
                File.WriteAllLines(Environment.CurrentDirectory + "//" + pageIndex + ".txt",
                                   small.Select(i => i.ToString()));
                small.Clear();
            }
        }
        // Done: flush whatever is left and stop
        else {
            pageCount = (int)Math.Ceiling((double)totalCount / pageSize);
            if (small.Count > 0){
                small = small.OrderBy(i => i).ToList();
                File.WriteAllLines(Environment.CurrentDirectory + "//" + pageCount + ".txt",
                                   small.Select(i => i.ToString()));
                small.Clear();
            }
            break;
        }
    }
    sr.Close();
    return pageCount; // how many small files were produced
}
// result.txt
// Refill helper: push the next Top N numbers of small file (i+1).txt into its priority queue;
// called once at startup and again whenever that queue runs dry.
// top = 10 keeps 33 queues within the 1200-number budget (33*10 + 500 + 66 = 896).
public static void AddQueue(int i, List<PriorityQueue<int?>> list, ref int[] skip, int top = 10){
    var result = File.ReadAllLines(Environment.CurrentDirectory + "//" + (i + 1) + ".txt")
                     .Skip(skip[i]).Take(top)
                     .Select(j => Convert.ToInt32(j)).ToList();
    // payload is unused (null); the number itself is the priority
    foreach (var item in result){
        list[i].Enqueue(null, item);
    }
    // how many lines of file i have been consumed so far
    skip[i] += result.Count;
}
// Test setup:
//   memory budget = 1200 numbers
//   pageSize  = 1000 (numbers per small file)
//   pageCount = 33   (small files)
//   33 files => 33 priority queues, each seeded with the Top 10 of its file
//   peak in memory: 33*10 + 500 + 66 = 896 < 1200; Time: O(N log N)
// NOTE: PriorityQueue<T> is the custom class from the cnblogs link in the references;
// assumed API (the generic arguments were lost in the original paste):
// Enqueue(T t, int level), Dequeue() -> node with fields t and level, and a Count property.
public static void Main(){
    // 1. Data
    // Generate short.MaxValue numbers
    createData(short.MaxValue);
    // Number of lines in each small file
    var pageSize = 1000;
    // Flush the result batch to disk whenever it reaches this size
    var batchCount = 500;
    // Number of small files produced
    var pageCount = split(pageSize);

    // 2. Chunk: one priority queue per small file (memory budget: 1200 numbers)
    List<PriorityQueue<int?>> list = new List<PriorityQueue<int?>>();
    // Intermediate "transfer station": one candidate per queue, tagged with its queue index
    PriorityQueue<int?> intermediateQueueControl = new PriorityQueue<int?>();
    // Status of each small file
    bool[] complete = new bool[pageCount];
    // How many lines of each small file have been consumed so far
    int[] skip = new int[pageCount];
    // How many files are fully consumed
    int allComplete = 0;
    // Build the priority queues
    for (int i = 0; i < pageCount; i++){
        list.Add(new PriorityQueue<int?>());
        AddQueue(i, list, ref skip);
    }

    // 3. Merge
    for (int i = 0; i < list.Count; i++){
        var temp = list[i].Dequeue();
        intermediateQueueControl.Enqueue(i, temp.level);
    }
    List<int> batch = new List<int>();
    int nextIndex = 0;
    while (intermediateQueueControl.Count > 0)
    {
        // take the current global minimum out of the transfer station
        var single = intermediateQueueControl.Dequeue();
        // refill the transfer station from the queue the minimum came from
        nextIndex = single.t.Value;
        var nextData = list[nextIndex].Dequeue();
        // that small file's priority queue is empty: reload from the file
        if (nextData == null){
            AddQueue(nextIndex, list, ref skip);
            // still nothing: the file itself is exhausted
            if (list[nextIndex].Count == 0){
                complete[nextIndex] = true;
                allComplete++;
            } else {
                nextData = list[nextIndex].Dequeue();
            }
        }
        // push the replacement candidate into the transfer station
        if (nextData != null){
            intermediateQueueControl.Enqueue(nextIndex, nextData.level);
        }
        batch.Add(single.level);
        if (batch.Count == batchCount || allComplete == pageCount)
        {
            var sw = new StreamWriter(Environment.CurrentDirectory + "//result.txt", true);
            foreach (var item in batch){
                sw.WriteLine(item);
            }
            sw.Close();
            batch.Clear();
        }
    }

    // 4. Clean up: flush anything still buffered
    if (batch.Count > 0){
        File.AppendAllLines(Environment.CurrentDirectory + "//result.txt", batch.Select(i => i.ToString()));
    }
    Console.WriteLine("Done");
    Console.Read();
}
class ListNode {
    int data;
    ListNode next;
}

class MinHeapNode {
    ListNode head;
}

class MinHeap {
    int count;
    int capacity;
    MinHeapNode[] array;
}

public class ExternalSortOnMachines {

    public static MinHeap createMinHeap(int capacity){
        MinHeap minHeap = new MinHeap();
        minHeap.capacity = capacity;
        minHeap.count = 0; // initialize as ZERO
        minHeap.array = new MinHeapNode[minHeap.capacity];
        return minHeap;
    }

    // Insert a new node at the beginning of a linked list and return the new head
    public static ListNode push(ListNode head, int new_data){
        ListNode newNode = new ListNode();
        newNode.data = new_data;
        newNode.next = head;
        return newNode;
    }

    public static void minHeapify(MinHeap minHeap, int idx){
        int left = 2 * idx + 1;
        int right = 2 * idx + 2;
        int smallest = idx;
        if (left < minHeap.count && minHeap.array[left].head.data < minHeap.array[smallest].head.data){
            smallest = left;
        }
        if (right < minHeap.count && minHeap.array[right].head.data < minHeap.array[smallest].head.data){
            smallest = right;
        }
        if (smallest != idx){
            MinHeapNode tmp = minHeap.array[smallest];
            minHeap.array[smallest] = minHeap.array[idx];
            minHeap.array[idx] = tmp;
            minHeapify(minHeap, smallest); // keep sifting the swapped node down
        }
    }

    public static boolean isEmpty(MinHeap minHeap){
        return minHeap.count == 0;
    }

    public static void buildMinHeap(MinHeap minHeap){
        int n = minHeap.count;
        for (int i = (n - 2) / 2; i >= 0; i--){
            minHeapify(minHeap, i);
        }
    }

    public static void populateMinHeap(MinHeap minHeap, ListNode[] array, int n){
        for (int i = 0; i < n; i++){
            MinHeapNode node = new MinHeapNode(); // count was initialized as ZERO
            node.head = array[i];
            minHeap.array[minHeap.count++] = node;
        }
        buildMinHeap(minHeap);
    }

    public static ListNode extractMin(MinHeap minHeap){
        // validate the input
        if (isEmpty(minHeap)){
            return null;
        }
        // the root holds the global minimum; save it before reusing the slot
        MinHeapNode tmp = minHeap.array[0];
        ListNode min = tmp.head;
        if (tmp.head.next != null){
            // replace the root with the next node of the same list
            minHeap.array[0].head = tmp.head.next;
        } else {
            // that list is exhausted: move the last slot to the root and shrink the heap
            minHeap.array[0] = minHeap.array[minHeap.count - 1];
            minHeap.count--;
        }
        minHeapify(minHeap, 0);
        return min;
    }

    public static void externalSort(ListNode[] array, int N){
        MinHeap minHeap = createMinHeap(N);
        populateMinHeap(minHeap, array, N);
        while (!isEmpty(minHeap)){
            ListNode tmp = extractMin(minHeap);
            System.out.println(tmp.data);
        }
    }

    public static void main(String[] args){
        int N = 3; // number of machines
        ListNode[] array = new ListNode[N];
        // push() returns the new head, so capture it
        array[0] = null;
        array[0] = push(array[0], 50);
        array[0] = push(array[0], 40);
        array[0] = push(array[0], 30);
        array[1] = null;
        array[1] = push(array[1], 45);
        array[1] = push(array[1], 35); // insert at the beginning
        array[2] = null;
        array[2] = push(array[2], 100);
        array[2] = push(array[2], 80);
        array[2] = push(array[2], 70);
        array[2] = push(array[2], 60);
        array[2] = push(array[2], 10);
        externalSort(array, N);
    }
}

Output: 10 30 35 40 45 50 60 70 80 100
[SD] Bloom filter [DONE]
import java.util.BitSet;

public class simplebloomfilter {
    private static final int default_size = 2 << 24;
    private static final int[] seeds = new int[]{5, 7, 11, 13, 41, 43, 61};
    private BitSet bits = new BitSet(default_size);
    private simplehash[] func = new simplehash[seeds.length];

    public static void main(String[] args) {
        String value = "simplebloomfilter@gmail.cn";
        simplebloomfilter filter = new simplebloomfilter();
        System.out.println(filter.contains(value));
        filter.add(value);
        System.out.println(filter.contains(value));
    }

    public simplebloomfilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new simplehash(default_size, seeds[i]);
        }
    }

    public void add(String value) {
        for (simplehash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        boolean ret = true;
        for (simplehash f : func) {
            ret = ret && bits.get(f.hash(value));
        }
        return ret;
    }

    public static class simplehash {
        private int cap;
        private int seed;

        public simplehash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i); // rolling hash
                System.out.println(result);               // debug: intermediate hash values
            }
            return (cap - 1) & result; // mask into [0, 2 << 24 - 1]
        }
    }
}
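For sizing intuition (the standard Bloom filter estimate, not derived from the code above): with m bits, k hash functions, and n inserted items, the false positive probability is roughly (1 - e^(-k*n/m))^k; here m = 2 << 24 bits and k = 7 seeds, so the false positive rate stays low as long as n is small relative to m.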
// http://www.programgo.com/article/42342753493/
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby
// Note - This code makes a few assumptions about how your machine behaves -
// 1. We can read a 4-byte value from any address without crashing
// 2. sizeof(int) == 4
// And it has a few limitations -
// 1. It will not work incrementally.
// 2. It will not produce the same results on little-endian and big-endian machines.
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
    // 'm' and 'r' are mixing constants generated offline.
    // They're not really 'magic', they just happen to work well.
    const unsigned int m = 0x5bd1e995;
    const int r = 24;

    // Initialize the hash to a 'random' value
    unsigned int h = seed ^ len;

    // Mix 4 bytes at a time into the hash
    const unsigned char * data = (const unsigned char *)key;
    while (len >= 4)
    {
        unsigned int k = *(unsigned int *)data;
        k *= m;
        k ^= k >> r;
        k *= m;

        h *= m;
        h ^= k;

        data += 4;
        len -= 4;
    }

    // Handle the last few bytes of the input array (intentional fall-through)
    switch (len)
    {
    case 3: h ^= data[2] << 16;
    case 2: h ^= data[1] << 8;
    case 1: h ^= data[0];
            h *= m;
    };

    // Do a few final mixes of the hash to ensure the last few
    // bytes are well-incorporated.
    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}

#include <iostream>
using namespace std;

unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed );

int main()
{
    unsigned int result = MurmurHash2("abcd", 4, 1);
    cout << result << endl;
    return 0;
}
Monday, November 2, 2015
[Quick select algorithm] Find the Kth smallest element in a list in average linear time
Source: http://cse-wiki.unl.edu/wiki/index.php/Sorting_Algorithms


Source: http://blog.teamleadnet.com/2012/07/quick-select-algorithm-find-kth-element.html

Arr = [5 1 4 3 2], Pivot = 4
Steps:
swap 5 and 2 since 5 >= 4 and 2 < 4  ->  [2 1 4 3 5]
swap 4 and 3 since 4 >= 4 and 3 < 4  ->  [2 1 3 4 5]
Arr = [2 1 3 …], Pivot = 1
swap 2 and 1 since 2 >= 1  ->  [1 2 3 …]
Arr = […2 3…], Pivot = 2
// Quick Select: O(n) on average instead of O(n log n), which matters for large datasets
// Source: http://blog.teamleadnet.com/2012/07/quick-select-algorithm-find-kth-element.html
public static int selectKth(int[] arr, int k) {
    if (arr == null || arr.length <= k)
        throw new Error();

    int from = 0, to = arr.length - 1;

    // if from == to we reached the kth element
    while (from < to) {
        int r = from, w = to;
        int mid = arr[(r + w) / 2];

        // stop if the reader and writer meets
        while (r < w) {
            if (arr[r] >= mid) { // put the large values at the end
                int tmp = arr[w];
                arr[w] = arr[r];
                arr[r] = tmp;
                w--;
            } else { // the value is smaller than the pivot, skip
                r++;
            }
        }

        // if we stepped up (r++) we need to step one down
        if (arr[r] > mid)
            r--;

        // the r pointer is on the end of the first k elements
        if (k <= r) {
            to = r;
        } else {
            from = r + 1;
        }
    }

    return arr[k];
}
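As a usage sketch (not in the original post), assuming selectKth above sits in the same class; note that it partially reorders the array in place:

public static void main(String[] args) {
    int[] arr = {5, 1, 4, 3, 2};
    // k is 0-based: selectKth(arr, 2) returns the 3rd smallest element
    System.out.println(selectKth(arr, 2)); // prints 3
}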