larbin源碼分析(1)--global結構

    從今天開始寫larbin的源碼分析。但願在一個月後,自己可以對larbin的源碼瞭如指掌,也但願到那時自己能夠寫出一個爬蟲。

 

下面是global的結構,從global中的成員,我們可以大概對larbin有個瞭解。

  struct global {
  /** Constructor : see global.cc for details */
  global (int argc, char * argv[]);          
  /** Destructor : never used */
  ~global ();
  /** current time : avoid to many calls to time(NULL) */
  static time_t now;                                          //當前的時間
  /** List of pages allready seen (one bit per page) */
  static hashTable *seen;                                  //已經掃描過的頁
#ifdef NO_DUP
  /** Hashtable for suppressing duplicates */
  static hashDup *hDuplicate;
#endif // NO_DUP
  /** URLs for the sequencer with high priority */               
  static SyncFifo<url> *URLsPriority;                       //SyncFifo 是一個同步的先進先出的隊列.點此可進入此結構的研究
socket

   static SyncFifo<url> *URLsPriorityWait;    
  static uint readPriorityWait;
  /** This one has a lower priority : see fetch/sequencer.cc */
  static PersistentFifo *URLsDisk;                          //PersisitentFifo是一個存儲在硬盤上的隊列。點此進入此結構
  static PersistentFifo *URLsDiskWait;
  static uint readWait;
  /** hashtables of the site we accessed (cache) */
  static NamedSite *namedSiteList;                      //NamedSite是存儲已經訪問過的網站的,每個NamedSite都對應一個IPsite。點此進入NamedSite的研究。 ide

  static IPSite *IPSiteList;      //點此進入ipsite的研究
  /** Sites which have at least one url to fetch */
  static Fifo<IPSite> *okSites;         // Fifo是非同步的標準的存在於RAM中的隊列。點此進入Fifo的研究
  /** Sites which have at least one url to fetch
   * but need a dns call
   */
  static Fifo<NamedSite> *dnsSites;
  /** Informations for the fetch
   * This array contain all the connections (empty or not)
   */
  static Connexion *connexions;
  /** Internal state of adns */
  static adns_state ads;
  /* Number of pending dns calls */
  static uint nbDnsCalls;
  /** free connection for fetchOpen : connections with state==EMPTY */
  static ConstantSizedFifo<Connexion> *freeConns;
#ifdef THREAD_OUTPUT
  /** free connection for fetchOpen : connections waiting for end user */
  static ConstantSizedFifo<Connexion> *userConns;
#endif
  /** Sum of the sizes of a fifo in Sites */
  static Interval *inter;
  /** How deep should we go inside a site */
  static int8_t depthInSite;
  /** Follow external links ? */
  static bool externalLinks;
  /** how many seconds should we wait beetween 2 calls at the same server
   * 0 if you are only on a personnal server, >=30 otherwise
   */
  static time_t waitDuration;
  /** Name of the bot */
  static char *userAgent;
  /** Name of the man who lauch the bot */
  static char *sender;
  /** http headers to send with requests
   * sends name of the robots, from field...
   */
  static char *headers;
  static char *headersRobots;  // used when asking a robots.txt
  /* internet address of the proxy (if any) */
  static sockaddr_in *proxyAddr;
  /** connect to this server through a proxy using connection conn
   * return >0 in case of success (connecting or connected), 0 otherwise
   */
  static char getProxyFds (Connexion *conn);
  /** Limit to domain */
  static Vector<char> *domains;
  /** forbidden extensions
   * extensions which are allways to avoid : .ps, .pdf...
   */
  static Vector<char> forbExt;
  /** number of parallel connexions
   * your kernel must support a little more than nb_conn file descriptors
   */
  static uint nb_conn;
  /** number of parallel dns calls */
  static uint dnsConn;
  /** number of urls in IPSites */
  static int IPUrl;
  /** port on which is launched the http statistic webserver */
  static unsigned short int httpPort;
  /** port on which input wait for queries */
  static unsigned short int inputPort;
  /** parse configuration file */
  static void parseFile (char *file);
  /** read the domain limit */
  static void manageDomain (char **posParse);
  /** read the forbidden extensions */
  static void manageExt (char **posParse);
  /////////// POLL ///////////////////////////////////
  /** array used by poll */
  static struct pollfd *pollfds;
  /** pos of the max used field in pollfds */
  static uint posPoll;
  /** size of pollfds */
  static uint sizePoll;
  /** array used for dealing with answers */
  static short *ansPoll;
  /** number of the biggest file descriptor */
  static int maxFds;
  /** make sure the new socket is not too big for ansPoll */
  static void verifMax (int fd);
#ifdef MAXBANDWIDTH
  /** number of bits still allowed during this second */
  static long int remainBand;
#endif // MAXBANDWIDTH
};源碼分析

相關文章
相關標籤/搜索