Alibaba phone-interview question: given 1,000,000 URLs, how do you find the 100 that appear most frequently?

A question the interviewer gave me during an internal-referral Alibaba phone interview:

The first solution that came to mind: put the URLs into an STL map, sort by the frequency stored in the pair's second field, and return the results in sorted order:

Talk is cheap, so here is the code. Of course, in the discussion thread my solutions were technically crushed by an engineering veteran's SQL query; that is what real engineering, and an engineer's mindset, look like. Don't just bury your head in algorithms.


Discussion thread:

http://bbs.csdn.net/topics/391080906


Discussion thread on scraping Baidu search results with Python:

http://bbs.csdn.net/topics/391077668


Experimental data, scraped from Baidu with Python:

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""


import urllib2
import re

#connect to a URL
#one page of Baidu search results contains roughly 200 URLs
file_url = open('url.txt','ab+')
#the search term; it can be set to a number so each search returns different results
search = '123'
url = "http://www.baidu.com/s?wd=" + search

def setUrlToFile():
    website = urllib2.urlopen(url)
    #read html code
    html = website.read()
    #use re.findall to get all the links
    links = re.findall('"((http|ftp)s?://.*?)"', html)
    for s in links:
        print s[0]
        if len(s[0]) < 256:
            file_url.write(s[0]+'\r\n')

#collect the experimental data
for i in range(0,50):
    setUrlToFile()
file_url.close()

#the file has to be reopened before it can be read back
file_url = open('url.txt','r')
file_lines = len(file_url.readlines())
print "there are %d url in %s" %(file_lines,file_url)
file_url.close()

Method 1:

Read url.txt into a map in C++,

sort the map<string, int> by value, and take the top 100.

Running it takes about 55 s, which is still reasonably fast. URL length is restricted to fewer than 256 characters.


#pragma once
/*
//Class for timing how long a code segment takes to run
//
*/
#include <iostream>

#ifndef ComputeTime_h
#define ComputeTime_h


//unit: milliseconds

class   ComputeTime    
{  
private:  
	int Initialized;  
	__int64 Frequency;  
	__int64 BeginTime;  
		    
public:  

	bool Avaliable();  
	double End();  
	bool Begin();  
	ComputeTime();  
	virtual   ~ComputeTime();    

};  






#endif
#include "stdafx.h"
#include "ComputeTime.h"
#include <iostream>
#include <Windows.h>

ComputeTime::ComputeTime()  
{  
	Initialized=QueryPerformanceFrequency((LARGE_INTEGER   *)&Frequency);  
}  
   
 ComputeTime::~ComputeTime()  
{  
		    
}  
   
 bool   ComputeTime::Begin()  
{  
	if(!Initialized)  
		return 0;

	 return   QueryPerformanceCounter((LARGE_INTEGER   *)&BeginTime);  
 }
     
 double   ComputeTime::End()
{  
	 if(!Initialized)  
		return 0;

		   
	 __int64   endtime;  
		   
	 QueryPerformanceCounter((LARGE_INTEGER   *)&endtime);  
		    
		  
	 __int64   elapsed = endtime-BeginTime;  
		    
		  
	 return   ((double)elapsed/(double)Frequency)*1000.0;  //unit: milliseconds
 }  

 bool   ComputeTime::Avaliable()
{  
	 return Initialized;  
}   


// sortUrl.cpp : defines the entry point for the console application.
//

#include "stdafx.h"
//#include <utility>    
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include "ComputeTime.h"

using namespace std;

map<string,int> urlfrequency;


typedef pair<string, int> PAIR;


struct CmpByValue 
{
	bool operator()(const PAIR& lhs, const PAIR& rhs) 
	{
		return lhs.second > rhs.second;
	}
};

void find_largeTH(map<string,int> urlfrequency)
{
	//copy the map entries into a vector and sort them by value
	vector<PAIR> url_quency_vec(urlfrequency.begin(), urlfrequency.end());
	sort(url_quency_vec.begin(), url_quency_vec.end(), CmpByValue());
	//guard against having fewer than 100 unique URLs
	for (int i = 0; i != 100 && i != (int)url_quency_vec.size(); ++i) 
	{
		cout<<url_quency_vec[i].first<<endl;
		cout<<url_quency_vec[i].second<<endl;
	}
}


//insert a URL into the map; if it is already present, increment its count
void insertUrl(string url)
{
	pair<map<string ,int>::iterator, bool> Insert_Pair;
	Insert_Pair = urlfrequency.insert(map<string, int>::value_type(url,1));



	if (Insert_Pair.second == false)
	{
		(Insert_Pair.first->second++);
	}
	

}


int _tmain(int argc, _TCHAR* argv[])
{
	fstream URLfile;
	char buffer[1024]; 
	URLfile.open("url.txt",ios::in|ios::out|ios::binary);

	if (! URLfile.is_open())  
	{ cout << "Error opening file"; exit (1); } 
	else
	{
	cout<<"open file success!"<<endl;
	}

	ComputeTime cp;
	cp.Begin();
	//drive the loop with getline so the spurious empty line at EOF is not counted
	while (URLfile.getline(buffer,1024))
	{
		string temp(buffer);
		insertUrl(temp);
	}
	      


	find_largeTH(urlfrequency);

	cout<<"running time: "<<cp.End()<<"ms"<<endl;

	getchar();
	//system("pause");
	return 0;
}



Result: about 55 s, which is not too bad and is acceptable; after all, it was the first solution that came to mind.
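Since only the top 100 entries are needed, fully sorting every unique URL does more work than necessary. As a small refinement of method 1 (my own sketch, not part of the original post), std::partial_sort can order just the first 100 positions:

// Sketch only: same idea as method 1, but sorting only the first 100 positions.
// The function name find_largeTH_partial is mine, not from the original code.
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>
using namespace std;

typedef pair<string, int> URL_PAIR;

struct CmpByValueDesc
{
	bool operator()(const URL_PAIR& lhs, const URL_PAIR& rhs) const
	{
		return lhs.second > rhs.second;
	}
};

void find_largeTH_partial(const map<string, int>& urlfrequency)
{
	vector<URL_PAIR> vec(urlfrequency.begin(), urlfrequency.end());
	size_t topN = vec.size() < 100 ? vec.size() : 100;
	//only the first topN elements end up sorted; the rest stay in arbitrary order
	partial_sort(vec.begin(), vec.begin() + topN, vec.end(), CmpByValueDesc());
	for (size_t i = 0; i != topN; ++i)
	{
		cout << vec[i].first << "  " << vec[i].second << endl;
	}
}

This should mainly save time on the sort step; the cost of building the map stays the same.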



Method 2:

A hash-code version. The only thing I did not work out is how to associate the hash code back with its URL:


// urlFind.cpp : defines the entry point for the console application.
//

#include "stdafx.h"
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include <unordered_map>
#include "ComputeTime.h"

using namespace std;

map<unsigned int,int> urlhash;

typedef pair<unsigned int, int> PAIR;

struct info
{
	string url;
	int cnt;
	bool operator<(const info &r) const
	{
		return cnt>r.cnt;
	}
};

unordered_map<string,int> count;	//left over from an earlier attempt, unused here
//priority_queue<info> pq;

struct CmpByValue
{
	bool operator()(const PAIR& lhs, const PAIR& rhs)
	{
		return lhs.second > rhs.second;
	}
};

void find_largeTH(map<unsigned int,int> urlhash)
{
	//copy the map entries into a vector and sort them by value
	vector<PAIR> url_quency_vec(urlhash.begin(), urlhash.end());
	sort(url_quency_vec.begin(), url_quency_vec.end(), CmpByValue());
	for (int i = 0; i != 100 && i != (int)url_quency_vec.size(); ++i)
	{
		cout<<url_quency_vec[i].first<<endl;
		cout<<url_quency_vec[i].second<<endl;
	}
}

// BKDR Hash Function
unsigned int BKDRHash(char *str)
{
	unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
	unsigned int hash = 0;
	while (*str)
	{
		hash = hash * seed + (*str++);
	}
	return (hash & 0x7FFFFFFF);
}

void insertUrl(string url)
{
	unsigned int hashvalue = BKDRHash((char *)url.c_str());
	pair<map<unsigned int ,int>::iterator, bool> Insert_Pair;
	Insert_Pair = urlhash.insert(map<unsigned int, int>::value_type(hashvalue,1));
	if (Insert_Pair.second == false)
	{
		(Insert_Pair.first->second++);
	}
}

int _tmain(int argc, _TCHAR* argv[])
{
	fstream URLfile;
	char buffer[1024];
	URLfile.open("url.txt",ios::in|ios::out|ios::binary);

	if (! URLfile.is_open())
	{ cout << "Error opening file"; exit (1); }
	else
	{
		cout<<"open file success!"<<endl;
	}

	ComputeTime cp;
	cp.Begin();
	while (URLfile.getline(buffer,1024))
	{
		string temp(buffer);
		insertUrl(temp);
	}

	find_largeTH(urlhash);

	cout<<"running time: "<<cp.End()<<"ms"<<endl;

	getchar();
	return 0;
}



Performance is around 15 s. The drawback is that the hash code is not associated back with the URL, but the processing speed is already quite respectable.
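One possible way to keep the hash code tied to its URL (a sketch of my own, not from the thread, and it simply ignores hash collisions) is to store the URL next to the counter in the map value:

// Sketch only: key on the BKDR hash value, but keep (url, count) together in
// the value so the top-100 output can still print the URL.
// Hash collisions are ignored here for simplicity.
#include <iostream>
#include <map>
#include <string>
#include <utility>
using namespace std;

//same BKDR hash as above
unsigned int BKDRHash2(const char* str)
{
	unsigned int seed = 131;
	unsigned int hash = 0;
	while (*str)
	{
		hash = hash * seed + (*str++);
	}
	return (hash & 0x7FFFFFFF);
}

map<unsigned int, pair<string, int> > url_by_hash;   //hash -> (url, count)

void insertUrlKeepingString(const string& url)
{
	unsigned int h = BKDRHash2(url.c_str());
	pair<map<unsigned int, pair<string, int> >::iterator, bool> r =
		url_by_hash.insert(make_pair(h, make_pair(url, 1)));
	if (!r.second)
	{
		r.first->second.second++;   //already seen: bump the count
	}
}

Sorting by the count then works the same way as in method 2, except the URL is still available when printing the top 100.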


Method 3:

The following uses the STL hash container unordered_map together with a priority_queue (i.e. a heap) to solve the problem.


// urlFind.cpp : defines the entry point for the console application.
//

#include "stdafx.h"
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include <unordered_map>
#include <queue>
#include "ComputeTime.h"

using namespace std;

typedef pair<string, int> PAIR;

struct info
{
	string url;
	int cnt;
	bool operator<(const info &r) const
	{
		return cnt<r.cnt;
	}
};

unordered_map<string,int> hash_url;
priority_queue<info> pq;	//max-heap on cnt, because operator< compares counts

void find_largeTH(unordered_map<string,int> urlhash)
{
	//push every (url, count) pair onto the heap, then pop the 100 largest
	unordered_map<string,int>::iterator iter = urlhash.begin();
	info temp;
	for (; iter!= urlhash.end();++iter)
	{
		temp.url = iter->first;
		temp.cnt = iter->second;
		pq.push(temp);
	}
	for (int i = 0; i != 100; ++i)
	{
		cout<<pq.top().url<<endl;
		cout<<pq.top().cnt<<endl;
		pq.pop();
	}
}

void insertUrl(string url)
{
	pair<unordered_map<string ,int>::iterator, bool> Insert_Pair;
	Insert_Pair = hash_url.insert(unordered_map<string, int>::value_type(url,1));
	if (Insert_Pair.second == false)
	{
		(Insert_Pair.first->second++);
	}
}

int _tmain(int argc, _TCHAR* argv[])
{
	fstream URLfile;
	char buffer[1024];
	URLfile.open("url.txt",ios::in|ios::out|ios::binary);

	if (! URLfile.is_open())
	{ cout << "Error opening file"; exit (1); }
	else
	{
		cout<<"open file success!"<<endl;
	}

	ComputeTime cp;
	cp.Begin();
	while (URLfile.getline(buffer,1024))
	{
		string temp(buffer);
		insertUrl(temp);
	}

	find_largeTH(hash_url);

	cout<<"running time: "<<cp.End()<<"ms"<<endl;

	getchar();
	return 0;
}


This is basically one of the better algorithmic solutions; an interviewer would probably be pleased to hear this approach.
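A common refinement of this heap approach (my addition, not something in the original post) is to cap the heap at 100 entries: use a min-heap and pop whenever it grows past 100, so the heap stays tiny no matter how many unique URLs there are:

// Sketch only: bounded min-heap of at most 100 entries while scanning the counts.
#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

struct UrlCnt
{
	string url;
	int cnt;
};

//"greater" comparison turns priority_queue into a min-heap on cnt
struct MinHeapCmp
{
	bool operator()(const UrlCnt& a, const UrlCnt& b) const
	{
		return a.cnt > b.cnt;
	}
};

void printTop100(const unordered_map<string, int>& counts)
{
	priority_queue<UrlCnt, vector<UrlCnt>, MinHeapCmp> heap;
	for (unordered_map<string, int>::const_iterator it = counts.begin();
	     it != counts.end(); ++it)
	{
		UrlCnt e = { it->first, it->second };
		heap.push(e);
		if (heap.size() > 100)
		{
			heap.pop();   //drop the current smallest, keeping the 100 largest
		}
	}
	//the heap now holds the top 100, smallest count first
	while (!heap.empty())
	{
		cout << heap.top().url << "  " << heap.top().cnt << endl;
		heap.pop();
	}
}

The output comes out in ascending order of count; reversing it afterwards is trivial if descending order is required.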






Method 4: runtime not measured, but technically it crushes all of the solutions above. Don't keep reinventing the wheel, haha.

Database plus SQL:

load data infile "d:/bigdata.txt" into table tb_url(url);

SELECT
	url,
	count(url) AS show_count
FROM
	tb_url
GROUP BY url
ORDER BY show_count DESC
LIMIT 100;
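The thread did not show the table definition; a minimal one that the LOAD DATA and SELECT above could run against (my assumption, in MySQL syntax) would be something like:

-- Assumed table definition, not from the original thread:
-- one URL per line in d:/bigdata.txt, capped at 256 characters as in the C++ code.
CREATE TABLE tb_url (
	url VARCHAR(256)
);

An index on url may speed up the GROUP BY for larger inputs.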



