從今天開始寫larbin的源碼分析。希望在一個月後,自己可以對larbin的源碼瞭如指掌,也希望到那時自己能夠寫出一個爬蟲。
下面是global的結構,從global中的成員,我們可以大概對larbin有個瞭解。
/**
 * Crawler-wide state for larbin.
 *
 * Every data member and every helper below is declared static, so this
 * struct is effectively a namespace of globals; only the constructor and
 * destructor are instance members.
 */
struct global {
  /** Constructor : see global.cc for details */
  global (int argc, char * argv[]);
  /** Destructor : never used */
  ~global ();

  /** Current time : avoids too many calls to time(NULL) */
  static time_t now;
  /** List of pages already seen (one bit per page) */
  static hashTable *seen;
#ifdef NO_DUP
  /** Hashtable for suppressing duplicates */
  static hashDup *hDuplicate;
#endif // NO_DUP

  /** URLs for the sequencer with high priority.
   * Annotation: SyncFifo is a synchronized first-in-first-out queue.
   */
  static SyncFifo<url> *URLsPriority;
  static SyncFifo<url> *URLsPriorityWait;
  static uint readPriorityWait;
  /** This one has a lower priority : see fetch/sequencer.cc.
   * Annotation: PersistentFifo is a queue stored on disk.
   */
  static PersistentFifo *URLsDisk;
  static PersistentFifo *URLsDiskWait;
  static uint readWait;

  /** Hashtables of the sites we accessed (cache).
   * Annotation: NamedSite stores the sites that have already been visited;
   * each NamedSite corresponds to an IPSite.
   */
  static NamedSite *namedSiteList;
  static IPSite *IPSiteList;
  /** Sites which have at least one url to fetch.
   * Annotation: Fifo is the standard, unsynchronized queue that lives in RAM.
   */
  static Fifo<IPSite> *okSites;
  /** Sites which have at least one url to fetch
   * but need a dns call
   */
  static Fifo<NamedSite> *dnsSites;

  /** Information for the fetch.
   * This array contains all the connections (empty or not)
   */
  static Connexion *connexions;
  /** Internal state of adns */
  static adns_state ads;
  /** Number of pending dns calls */
  static uint nbDnsCalls;
  /** Free connections for fetchOpen : connections with state==EMPTY */
  static ConstantSizedFifo<Connexion> *freeConns;
#ifdef THREAD_OUTPUT
  /** Free connections for fetchOpen : connections waiting for end user */
  static ConstantSizedFifo<Connexion> *userConns;
#endif
  /** Sum of the sizes of a fifo in Sites */
  static Interval *inter;

  /** How deep should we go inside a site */
  static int8_t depthInSite;
  /** Follow external links ? */
  static bool externalLinks;
  /** How many seconds should we wait between 2 calls at the same server :
   * 0 if you are only on a personal server, >=30 otherwise
   */
  static time_t waitDuration;
  /** Name of the bot */
  static char *userAgent;
  /** Name of the person who launched the bot */
  static char *sender;
  /** http headers to send with requests :
   * sends name of the robot, from field...
   */
  static char *headers;
  /** Headers used when asking for a robots.txt */
  static char *headersRobots;
  /** Internet address of the proxy (if any) */
  static sockaddr_in *proxyAddr;
  /** Connect to this server through a proxy using connection conn.
   * Returns >0 in case of success (connecting or connected), 0 otherwise
   */
  static char getProxyFds (Connexion *conn);
  /** Limit to domain */
  static Vector<char> *domains;
  /** Forbidden extensions :
   * extensions which are always to be avoided : .ps, .pdf...
   */
  static Vector<char> forbExt;
  /** Number of parallel connexions :
   * your kernel must support a little more than nb_conn file descriptors
   */
  static uint nb_conn;
  /** Number of parallel dns calls */
  static uint dnsConn;
  /** Number of urls in IPSites */
  static int IPUrl;
  /** Port on which the http statistic webserver is launched */
  static unsigned short int httpPort;
  /** Port on which input waits for queries */
  static unsigned short int inputPort;

  /** Parse configuration file */
  static void parseFile (char *file);
  /** Read the domain limit */
  static void manageDomain (char **posParse);
  /** Read the forbidden extensions */
  static void manageExt (char **posParse);

  /////////// POLL ///////////////////////////////////
  /** Array used by poll */
  static struct pollfd *pollfds;
  /** Pos of the max used field in pollfds */
  static uint posPoll;
  /** Size of pollfds */
  static uint sizePoll;
  /** Array used for dealing with answers */
  static short *ansPoll;
  /** Number of the biggest file descriptor */
  static int maxFds;
  /** Make sure the new socket is not too big for ansPoll */
  static void verifMax (int fd);
#ifdef MAXBANDWIDTH
  /** Number of bits still allowed during this second */
  static long int remainBand;
#endif // MAXBANDWIDTH
};