PageRank算法是谷歌曾經獨步天下的「倚天劍」,該算法是由Larry Page和Sergey Brin在斯坦福大學讀研時發明的,論文點擊下載: The PageRank Citation Ranking: Bringing Order to the Web。
% Power-method PageRank on a 6-node example graph.
% G(i,j) = 1 means that node j links to node i.
n = 6;
i = [2 3 4 4 5 6 1 6 1];
j = [1 2 2 3 3 3 4 5 6];
G = sparse(i,j,1,n,n);

% For every node j collect the nodes it links to (L{j}) and its
% out-degree (c(j)).  Both are preallocated so that they neither grow
% inside the loop nor inherit stale entries from the workspace.
L = cell(1,n);
c = zeros(1,n);
for j = 1:n
   L{j} = find(G(:,j));
   c(j) = length(L{j});
end

% Power method
p = .85;              % damping factor
delta = (1-p)/n;      % constant added to every rank after damping
x = ones(n,1)/n;      % current ranks, starting from the uniform vector
z = zeros(n,1);       % ranks of the previous iteration
cnt = 0;              % number of iterations performed
while max(abs(x-z)) > .0001
   z = x;
   x = zeros(n,1);
   for j = 1:n
      if c(j) == 0
         % No out-links: move to an arbitrary page with equal probability.
         x = x + z(j)/n;
      else
         % Split the previous rank of j evenly among the pages it links to.
         x(L{j}) = x(L{j}) + z(j)/c(j);
      end
   end
   x = p*x + delta;
   cnt = cnt+1;
end
x = 0.2675 0.2524 0.1323 0.1698 0.0625 0.1156
# coding=utf-8
# python-graph


# Define pagerank function
def pagerank(graph, damping_factor=0.85, max_iterations=100,
             min_delta=0.00001):
    """
    Compute and return the PageRank in a directed graph.

    The graph only has to provide C{nodes()}, C{incidents(node)} (nodes
    linking to C{node}) and C{neighbors(node)} (nodes C{node} links to).
    The ranks after every iteration are printed to standard output.

    @type  graph: digraph
    @param graph: Digraph.

    @type  damping_factor: number
    @param damping_factor: PageRank damping factor.

    @type  max_iterations: number
    @param max_iterations: Maximum number of iterations.

    @type  min_delta: number
    @param min_delta: Smallest variation required for a new iteration.

    @rtype:  Dict
    @return: Dict containing all the nodes PageRank; empty for an empty graph.
    """
    nodes = graph.nodes()
    graph_size = len(nodes)
    if graph_size == 0:
        return {}

    # Value for nodes without inbound links.
    min_value = (1.0 - damping_factor) / graph_size

    # Every node starts with a rank of 1.0 (not 1/N).
    ranks = dict.fromkeys(nodes, 1.0)

    for i in range(max_iterations):
        diff = 0  # total difference compared to the last iteration
        # Compute each node's PageRank from its inbound links.  The dict is
        # updated in place, so nodes later in the sweep already see the new
        # ranks of nodes processed earlier in the same sweep.
        for node in nodes:
            rank = min_value
            for referring_page in graph.incidents(node):
                rank += (damping_factor * ranks[referring_page] /
                         len(graph.neighbors(referring_page)))
            diff += abs(ranks[node] - rank)
            ranks[node] = rank

        print('This is NO.%s iteration' % (i + 1))
        print(ranks)
        print('')

        # Stop if PageRank has converged.
        if diff < min_delta:
            break

    return ranks


def main():
    """Build the 4-node example graph and print its PageRank iterations."""
    # The third-party imports live here rather than at module level so that
    # pagerank() can be imported without python-graph or graphviz installed.
    # Import graphviz (only needed by the optional drawing code below)
    import graphviz as gv
    # Import pygraph
    from pygraph.classes.digraph import digraph
    from pygraph.readwrite.dot import write

    # Graph creation
    gr = digraph()

    # Add nodes and edges
    gr.add_nodes(["1", "2", "3", "4"])
    for edge in (("1", "2"), ("1", "3"), ("1", "4"), ("2", "3"),
                 ("2", "4"), ("3", "4"), ("4", "2")):
        gr.add_edge(edge)

    # Draw as PNG
    # dot = write(gr)
    # gvv = gv.readstring(dot)
    # gv.layout(gvv,'dot')
    # gv.render(gvv,'png','Model.png')

    pagerank(gr)


if __name__ == '__main__':
    main()
This is NO.32 iteration {'1': 0.2675338708706491, '3': 0.13227261904986046, '2': 0.2524037902400518, '5': 0.062477242064127136, '4': 0.1697488529161491, '6': 0.1155828978186352}
function x = pagerank1(G)
% PAGERANK1  Google's PageRank modified version 1 - hujiawei
%   x = pagerank1(G) returns the PageRank vector of the graph whose
%   connectivity matrix is G, where G(i,j) = 1 means node j links to
%   node i.  The damping factor is fixed at p = 0.85.  The ranks are
%   found by solving the sparse linear system (I - p*G*D)*x = e directly
%   and scaling the solution so that sum(x) == 1.

%if nargin < 3, p = .85; end
p = 0.85;

% Eliminate any self-referential links
G = G - diag(diag(G));

% c = out-degree of every node, i.e. the column sums of G
n = size(G,1);
c = sum(G,1);

% Scale column sums to be 1 (or 0 where there are no out links).
k = find(c~=0);
D = sparse(k,k,1./c(k),n,n);

% Solve (I - p*G*D)*x = e
e = ones(n,1);
I = speye(n,n);
x = (I - p*G*D)\e;

% Normalize so that sum(x) == 1.
x = x/sum(x);
function x = pagerank2(G)
% PAGERANK2  Google's PageRank modified version 2 - hujiawei
%   x = pagerank2(G) returns the PageRank vector of the graph whose
%   connectivity matrix is G, where G(i,j) = 1 means node j links to
%   node i, using the inverse iteration method.  The damping factor is
%   fixed at p = 0.85 and the result is scaled so that sum(x) == 1.

%if nargin < 3, p = .85; end
p = 0.85;

% Eliminate any self-referential links
G = G - diag(diag(G));

% c = out-degree of every node, i.e. the column sums of G
n = size(G,1);
c = sum(G,1);

% Scale column sums to be 1 (or 0 where there are no out links).
k = find(c~=0);
D = sparse(k,k,1./c(k),n,n);

e = ones(n,1);
I = speye(n,n);

% Inverse iteration: solve (I - A)*x = e with A = p*G*D + delta.
% Adding the scalar delta to every entry makes A a full matrix.
% NOTE(review): when every node has at least one out-link the columns of A
% sum to 1, so I - A is singular in exact arithmetic and the solve relies
% on round-off (MATLAB may warn about it) -- confirm this is intended.
delta = (1-p)/n;
A = p*G*D + delta;
x = (I - A)\e;

% Normalize so that sum(x) == 1.
x = x/sum(x);
function [U,G] = surfer(root,n)
% SURFER  Create the adjacency graph of a portion of the Web.
%    [U,G] = surfer(root,n) starts at the URL root and follows
%    Web links until it forms an adjacency graph with n nodes.
%    U = a cell array of n strings, the URLs of the nodes.
%    G = an n-by-n sparse matrix with G(i,j)=1 if node j is linked to node i.
%
%    Example:  [U,G] = surfer('http://www.example.com',500);
%    See also PAGERANK.
%
%    This function currently has two defects.  (1) The algorithm for
%    finding links is naive.  We just look for the string 'http:'.
%    (2) An attempt to read from a URL that is accessible, but very slow,
%    might take an unacceptably long time to complete.  In some cases,
%    it may be necessary to have the operating system terminate MATLAB.
%    Key words from such URLs can be added to the skip list in surfer.m.

% Initialize the figure: an n-by-n plot of the links found so far, two
% status text lines (t1, t2) and the 'slow' / 'quit' toggle buttons.

clf
shg
set(gcf,'doublebuffer','on')
axis([0 n 0 n])
axis square
axis ij
box on
set(gca,'position',[.12 .20 .78 .78])
uicontrol('style','frame','units','normal','position',[.01 .09 .98 .07]);
uicontrol('style','frame','units','normal','position',[.01 .01 .98 .07]);
t1 = uicontrol('style','text','units','normal','position',[.02 .10 .94 .04], ...
   'horiz','left');
t2 = uicontrol('style','text','units','normal','position',[.02 .02 .94 .04], ...
   'horiz','left');
slow = uicontrol('style','toggle','units','normal', ...
   'position',[.01 .24 .07 .05],'string','slow','value',0);
quit = uicontrol('style','toggle','units','normal', ...
   'position',[.01 .17 .07 .05],'string','quit','value',0);

% U holds the URLs found so far, hash their hash codes, m their count.
U = cell(n,1);
hash = zeros(n,1);
G = logical(sparse(n,n));
m = 1;
U{m} = root;
hash(m) = hashfun(root);

j = 1;
while j < n & get(quit,'value') == 0

   % Try to open a page.

   try
      set(t1,'string',sprintf('%5d %s',j,U{j}))
      set(t2,'string','');
      drawnow
      page = urlread(U{j});
   catch
      set(t1,'string',sprintf('fail: %5d %s',j,U{j}))
      drawnow
      % Move on to the next URL.  Without this increment the loop would
      % retry the same unreadable page forever, because 'continue' skips
      % the j = j+1 at the bottom of the loop.
      j = j+1;
      continue
   end
   if get(slow,'value')
      pause(.25)
   end

   % Follow the links from the open page.

   for f = findstr('http:',page);

      % A link starts with 'http:' and ends with the next quote.

      e = min([findstr('"',page(f:end)) findstr('''',page(f:end))]);
      if isempty(e), continue, end
      url = deblank(page(f:f+e-2));
      url(url<' ') = '!';   % Nonprintable characters
      if url(end) == '/', url(end) = []; end

      % Look for links that should be skipped.
      % NOTE(review): the empty '' entry looks like a keyword that was lost;
      % it is kept as is -- confirm what it was meant to be.

      skips = {'.gif','.jpg','.pdf','.css','lmscadsi','cybernet', ...
               'search.cgi','.ram','', ...
               'scripts','netscape','shockwave','webex','fansonly'};
      skip = any(url=='!') | any(url=='?');
      k = 0;
      while ~skip & (k < length(skips))
         k = k+1;
         skip = ~isempty(findstr(url,skips{k}));
      end
      if skip
         % Skipped images are not reported in the status line.
         if isempty(findstr(url,'.gif')) & isempty(findstr(url,'.jpg'))
            set(t2,'string',sprintf('skip: %s',url))
            drawnow
            if get(slow,'value')
               pause(.25)
            end
         end
         continue
      end

      % Check if page is already in url list.
      % The hash only narrows the candidates; isequal confirms the match.

      i = 0;
      for k = find(hash(1:m) == hashfun(url))';
         if isequal(U{k},url)
            i = k;
            break
         end
      end

      % Add a new url to the graph if there are fewer than n.

      if (i == 0) & (m < n)
         m = m+1;
         U{m} = url;
         hash(m) = hashfun(url);
         i = m;
      end

      % Add a new link.

      if i > 0
         G(i,j) = 1;
         set(t2,'string',sprintf('%5d %s',i,url))
         line(j,i,'marker','.','markersize',6)
         drawnow
         if get(slow,'value')
            pause(.25)
         end
      end
   end

   j = j+1;
end

% Remove the status controls and turn the quit toggle into a close button.
delete(t1)
delete(t2)
delete(slow)
set(quit,'string','close','callback','close(gcf)','value',0)

%------------------------

function h = hashfun(url)
% Almost unique numeric hash code for pages already visited.
h = length(url) + 1024*sum(url);
process the data into the form {node i: [its adjacent node list], ...}
while the sum of differences between the last two PageRank values >= threshold:
    map({node i: [its adjacent node list], ...}):
        map_output = {}
        for every node j in the adjacent node list:
            put or sum up {j: (i, PageRank(i) / length(adjacent node list))} into map_output
        return map_output

    reduce(map_output):
        reduce_output = {}
        for every entry {j: (i, PageRank(i) / length(adjacent node list))} in map_output:
            put or sum up all PageRank values for node j, together with its adjacent node list, into reduce_output
        return reduce_output
其中,關於用戶1和2的數據被mapperA讀取並處理,關於用戶3和4的數據被mapperB讀取並處理 [經驗證,即使一個用戶的數據是由不同的mapper來讀取的,最終收斂到的結果也差不多]
(1)首先是使用Matlab採用冪法的方式計算出在p=1.0的情況下示例得到的結果 [它的主要作用是驗證後面python版本的正確性]
% Power-method PageRank with p = 1.0 on the 4-node example graph.
% G(i,j) = 1 means that node j links to node i.
n = 4;
i = [2 3 4 3 4 4 1 2];
j = [1 1 1 2 2 3 3 4];
G = sparse(i,j,1,n,n);

% For every node j collect the nodes it links to (L{j}) and its
% out-degree (c(j)).  Both are preallocated so that they neither grow
% inside the loop nor inherit stale entries from the workspace.
L = cell(1,n);
c = zeros(1,n);
for j = 1:n
   L{j} = find(G(:,j));
   c(j) = length(L{j});
end

% Power method
p = 1.0;              % damping factor; with p = 1 delta below is zero
delta = (1-p)/n;
x = ones(n,1)/n;      % current ranks, starting from the uniform vector
z = zeros(n,1);       % ranks of the previous iteration
cnt = 0;              % number of iterations performed
while max(abs(x-z)) > .0001
   z = x;
   x = zeros(n,1);
   for j = 1:n
      if c(j) == 0
         % No out-links: spread the rank of j over every page.
         x = x + z(j)/n;
      else
         % Split the previous rank of j evenly among the pages it links to.
         x(L{j}) = x(L{j}) + z(j)/c(j);
      end
   end
   x = p*x + delta;
   cnt = cnt+1;
end

sprintf('pagerank result:')
x
0.1072 0.3571 0.2143 0.3214
(2)matlab版本的page rank沒有採用mapreduce的思想進行迭代,所以我另外寫了一個python版本的利用mapreduce思想實現的pagerank算法(注:我並沒有使用python的map和reduce函數去實現,而是使用更加容易明白的實現),使用的閾值爲0.0001,最多迭代的次數爲100次。
# coding=utf-8
__author__ = 'hujiawei'
__doc__ = 'pagerank mapreduce'


class Node:
    """A graph node holding an identifier and its current PageRank value."""

    def __init__(self, id, pk):
        self.id = id  # node identifier
        self.pk = pk  # current PageRank value


def pk_map(map_input):
    """Map step: hand every node's rank out evenly to the nodes it links to.

    map_input maps a node to the list of nodes it links to.  The result maps
    every linked-to node to the sum of the shares it received in this input.
    """
    map_output = {}
    for node, outlinks in map_input.items():
        if not outlinks:
            continue  # nothing to distribute (the loop below would be empty)
        share = float(node.pk) / len(outlinks)
        for link in outlinks:
            map_output[link] = map_output.get(link, 0.0) + share
    return map_output


def pk_reduce(reduce_input):
    """Reduce step: add the shares of every map output onto the nodes' pk."""
    for result in reduce_input:
        for node, value in result.items():
            node.pk += value


def pk_clear(nodes):
    """Reset the rank of every node to 0 before a reduce step."""
    for node in nodes:
        node.pk = 0


def pk_last(nodes):
    """Return copies of the nodes so the previous ranks survive an update."""
    return [Node(node.id, node.pk) for node in nodes]


def pk_diff(nodes, lastnodes):
    """Return the summed absolute rank change between two node lists."""
    diff = 0
    for node, lastnode in zip(nodes, lastnodes):
        print('node pk %f, last node pk %f ' % (node.pk, lastnode.pk))
        diff += abs(node.pk - lastnode.pk)
    return diff


def pk_test1():
    """Iterate map/reduce PageRank on the 4-node example until convergence.

    The graph is split over two map inputs to mimic two mappers.  Iteration
    stops when the summed rank change drops below the threshold or after
    max_iters rounds.  Returns the list of nodes with their final ranks.
    """
    node1 = Node(1, 0.25)
    node2 = Node(2, 0.25)
    node3 = Node(3, 0.25)
    node4 = Node(4, 0.25)
    nodes = [node1, node2, node3, node4]
    threshold = 0.0001
    max_iters = 100

    # The link structure never changes, so the map inputs are built once.
    in1 = {node1: [node2, node3, node4], node2: [node3, node4]}
    in2 = {node3: [node1, node4], node4: [node2]}

    for iter_count in range(1, max_iters + 1):
        lastnodes = pk_last(nodes)

        print('============ map count %d =================' % (iter_count))
        # Both map steps must run before pk_clear, they read the old ranks.
        mapout1 = pk_map(in1)
        mapout2 = pk_map(in2)
        for node, value in mapout1.items():
            print(str(node.id) + ' ' + str(value))
        for node, value in mapout2.items():
            print(str(node.id) + ' ' + str(value))

        print('============ reduce count %d =================' % (iter_count))
        reducein = [mapout1, mapout2]
        pk_clear(nodes)
        pk_reduce(reducein)
        for node in nodes:
            print(str(node.id) + ' ' + str(node.pk))

        diff = pk_diff(nodes, lastnodes)
        if diff < threshold:
            break

    return nodes


if __name__ == '__main__':
    pk_test1()
1 0.107138774577 2 0.35712924859 3 0.214296601128 4 0.321435375705
OK,差不多了,希望對需要理解PageRank算法的人有幫助! :-)