<> 第三章原子

時間 2019-11-07

標籤第三原子简体版

原文原文鏈接

3.1 shell

使用2048作爲hash桶的數目: 鏈表最長的有5個元素，最少的1個元素，
絕大部分是在1-3個元素之間, 耗費的時間是[totaltime = 0.006123s]；
使用素數2039作爲hash桶的數目，幾乎沒有出現某個鏈表長達5個元
素的情況，絕大部分是2-3個元素，可見分佈更平均，而且耗時更少了
[totaltime = 0.000001s]。不過，我後來再次測試了，發現即使使用
2048作爲hash桶的數目，耗時也是[totaltime = 0.000001s]。不過
hash表分佈得更平均倒是不爭的事實。最後，我只有x86的機器，因此無法
判斷與機器的關聯性。即使有不同的機器，我也尚不知道從何處判斷關聯性的大小。

測算時間使用了gettimeofday函數；測試hash表的分佈情況時，使用自寫的
Atom_read函數遍歷hash表，然後寫入文件。沒有具體統計數字，只是肉眼觀察了一下。

主程序3_1.c：

#include <sys/time.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"

#include "atom.h"
#include "getOneword.h"

/*
 * Exercise 3.1 driver.
 *
 * Reads up to MAX_WORDS words from "test_3_1.txt" with getOneword(), passes
 * each one to Atom_new(), and accumulates the wall-clock time spent inside
 * Atom_new() (measured with gettimeofday). The total is printed in seconds,
 * then Atom_read() is called.
 *
 * Returns 0 on success, EXIT_FAILURE if the input file cannot be opened.
 */
int main(int argc, char const *argv[])
{
    enum { MAX_WORDS = 10000 };

    char buf[1024];
    int size = sizeof(buf);
    FILE *fp;
    int num;

    struct timeval tv_start, tv_end;
    double timeuse, totaltime = 0;

    (void)argc;
    (void)argv;

    /* Report a missing input file explicitly instead of relying on an
     * assertion, which may be compiled out. */
    fp = fopen("test_3_1.txt", "rb");
    if (fp == NULL) {
        perror("test_3_1.txt");
        return EXIT_FAILURE;
    }

    for (num = 0; num < MAX_WORDS; ++num) {
        if (getOneword(fp, buf, size, first, rest) == 0)
            break;

        /* Only the Atom_new() call itself is timed. */
        gettimeofday(&tv_start, NULL);
        Atom_new(buf, strlen(buf));
        gettimeofday(&tv_end, NULL);

        /* Elapsed microseconds, then converted to seconds. */
        timeuse = (tv_end.tv_sec - tv_start.tv_sec) * 1000000 +
                (tv_end.tv_usec - tv_start.tv_usec);
        timeuse /= 1000000;
        totaltime += timeuse;
    }

    printf("[totaltime = %fs]\n", totaltime);

    fclose(fp);

    Atom_read();

    return 0;
}

其中getOneword函數使用的是書中提供的函數，在getOneword.c中

#include <stdio.h>
#include <ctype.h>

/*
 * Read the next word from fp into buf.
 *
 * A word starts at the first character for which first() returns non-zero
 * and continues while rest() returns non-zero. At most size - 1 characters
 * are stored; any further characters of the word are read and discarded.
 * buf is always NUL-terminated when size >= 1. The character that ended the
 * word (if not EOF) is pushed back onto fp.
 *
 * Returns 1 if at least one character was stored in buf, 0 otherwise (end of
 * input, size == 1, or invalid arguments). With a NULL argument or
 * size <= 0 nothing is read and nothing is written.
 */
int getOneword(FILE *fp, char *buf, int size, int first(int c), int rest(int c))
{
    int i = 0;
    int c;

    /* Without room for even the terminator there is no safe place to
     * write; bail out before touching buf or the stream. */
    if (fp == NULL || buf == NULL || first == NULL || rest == NULL || size <= 0)
        return 0;

    /* Skip everything up to the first character that can start a word. */
    for (c = getc(fp); c != EOF; c = getc(fp)) {
        if (first(c)) {
            if (i < size - 1)
                buf[i++] = (char)c;
            c = getc(fp);
            break;
        }
    }

    /* Collect the remainder of the word, silently truncating at size - 1. */
    for ( ; c != EOF && rest(c); c = getc(fp)) {
        if (i < size - 1)
            buf[i++] = (char)c;
    }

    /* i never exceeds size - 1 here, so this index is always in bounds. */
    buf[i] = '\0';

    /* Leave the delimiter in the stream for the next reader. */
    if (c != EOF)
        ungetc(c, fp);

    return i > 0;
}

/* Word-start predicate: non-zero when c is an alphabetic character. */
int first(int c)
{
    int is_letter = isalpha(c);

    return is_letter;
}
/* Word-continuation predicate: 1 for an alphabetic character or '_', else 0. */
int rest(int c)
{
    if (c == '_')
        return 1;

    return isalpha(c) ? 1 : 0;
}

其中Atom_read函數如下：

/*
 * Dump the atom table to "result.txt".
 *
 * For every non-empty bucket, one line "buckets[i]: [str]" is written per
 * node in that bucket's chain, followed by three blank lines as a separator.
 *
 * Returns 1 when the file was written and closed successfully, 0 when it
 * could not be opened or closing it reported an error.
 */
int Atom_read()
{
	int i;
	struct atom *p;
	FILE *fp;

	/* Fail with a diagnostic instead of relying on an assertion, which may
	 * be compiled out. */
	fp = fopen("result.txt", "wb");
	if (fp == NULL) {
		perror("result.txt");
		return 0;
	}

	for (i = 0; i < NELEMS(buckets); i++) {
		if (buckets[i] == NULL)
			continue;

		for (p = buckets[i]; p; p = p->link)
			fprintf(fp, "buckets[%d]: [%s]\n", i, p->str);
		fprintf(fp, "\n\n\n");
	}

	/* fclose flushes buffered output, so a failure here means lost data. */
	if (fclose(fp) != 0) {
		perror("result.txt");
		return 0;
	}

	return 1;
}

3.2
對hash的理論瞭解不多，只關注了實際使用。如果想尋找多種多樣的hash函數，
業界著名的可參考下面的URL:
http://www.partow.net/programming/hashfunctions/index.html
在Libevent中，它的hash函數很簡單，只是將一個struct event 的地址單純地
右移6位得到hash碼。可見，還是應該根據自身的需要編寫，不過對我來說，
一般會選擇高德納的。

3.3
使用strncmp有個顯著缺陷，那就是遇到'\0'就會終止比較。比如有2個字符序列:
str1="abcd\0g" 和 str2="abcd\0m", 那麼使用strncmp函數就會認爲str1和
str2相等，這不可接受。書中使用逐個字符比較，當然也可以使用memcmp進行比較。
但這裏對於書中的寫法，仍然有個現實的考量，就是它會在拷貝完畢字符串之後顯
式地綴上'\0', 作者明顯是爲了取字符串方便, 不過這由應用程序調用時，是會產
生二義性的。

3.4
本題不明白。據說這種 char str[1] 的寫法很hack.

3.5
寫上hash碼對於特定操作是有明顯的好處: 當要擴展這個hash表時，不需要
再做一次hash了，直接將每個節點的hash碼模上新的hash桶的數目就得到了
在新hash表中所在的鏈表。這樣做不僅節省了時間，還帶來了一個額外的好
處，就是在舊表中同一個表項的節點，在新表中仍然在一起，因爲它們在舊
表中的hash碼是相同的，模上新的hash桶當然也相同了。這樣在搜索表項時，
舊有代碼照用不誤。

3.6
Atom_length函數之所以會慢，根本原因在於，最壞情況下它要遍歷所有存在
於hash表中的節點。一個簡單的解決方法是，稍微複雜化原子的數據結構，
即添加使用上述的hash碼。有了hash碼就可以直接定位到特定的某個鏈表。
那麼對atom的改造變爲：

/* One node of the atom hash table; nodes in the same bucket are chained. */
struct atom {
    struct atom *link;        /* next node in this bucket's chain, or NULL */
    unsigned long atom_hash;  /* hash code kept with the node */
    int len;                  /* presumably the length of str -- confirm against Atom_new */
    char *str;                /* the atom's character data */
};

/* The hash table itself: 2048 bucket heads, zero-initialized (all empty). */
static struct atom *buckets[2048];

而且也要改造Atom_new函數，它不能再返回p->str, 由於這不夠用了，
它要返回p這個atom結構體指針。

3.7
extern void Atom_init(int hint); 實現這個函數的現實意義暫時還沒發現。

3.8
代碼見下面，其中四個字符串是在做 3.1 題時找到的具有相同hash碼的字符串:
3_8.c

#include <sys/time.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"

#include "atom.h"

/*
 * Exercise 3.8 driver: creates four atoms, lists the table, frees the first
 * and third atom, then lists the table again.
 */
int main(int argc, char const *argv[])
{
    char *words[] = { "rebuttal", "handler", "hpl", "Zero" };
    const char *atoms[4];
    int k;

    /* Create the atoms in the same order as the word list. */
    for (k = 0; k < 4; k++)
        atoms[k] = Atom_new(words[k], strlen(words[k]));

    Atom_list();

    /* Remove "rebuttal" and "hpl"; the other two stay in the table. */
    Atom_free(atoms[0]);
    Atom_free(atoms[2]);

    Atom_list();

    return 0;
}

其中 Atom_list Atom_free Atom_reset 函數見下面：

/*
 * Remove the atom whose string pointer is str from the table and free it.
 *
 * str is compared by pointer identity (p->str == str), not by contents, so
 * it must be a pointer previously obtained for an atom in the table. If no
 * node matches, the table is left unchanged. str must not be NULL.
 *
 * A "to free: [...]" line is printed to stdout before the node is released.
 */
void Atom_free(const char *str)
{
	struct atom **pp;
	int i;

	assert(str);

	for (i = 0; i < NELEMS(buckets); i++)
		/* pp always points at the link that leads to the current node
		 * (the bucket head or the previous node's link field), so
		 * unlinking needs no separate "previous" pointer. */
		for (pp = &buckets[i]; *pp != NULL; pp = &(*pp)->link)
			if ((*pp)->str == str) {
				struct atom *victim = *pp;

				/* Print before freeing: str may point into the
				 * node's own storage. */
				printf("to free: [%s]\n", str);
				*pp = victim->link;
				free(victim);
				/* Stop here so the freed node is never touched
				 * again. Assumes nodes do not share str
				 * storage, so at most one node can match. */
				return;
			}
}

void Atom_list()
{
	struct atom *p;
	int i;
	for (i = 0; i < NELEMS(buckets); i++)
		for (p = buckets[i]; p; p = p->link)
			printf("%d: [%s]\n", i, p->str);
}

void Atom_reset(void)
{
	struct atom *p;
	int i;

	for (i = 0; i < NELEMS(buckets); i++)
		for (p = buckets[i]; p;) {
			buckets[i] = p->link;
			free(p);
			p = buckets[i];
		}
}

3.9
見下述代碼，主要使用了可變參數列表。
3_9.c

#include <sys/time.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"

#include "atom.h"

/*
 * Exercise 3.9 driver: loads the same four strings once through the
 * variadic Atom_vload() and once through the array-based Atom_aload(),
 * listing and then resetting the table after each load.
 */
int main(int argc, char const *argv[])
{
    /* const char * so the variadic arguments have exactly the type that
     * Atom_vload() reads back with va_arg. */
    const char *s1 = "rebuttal";
    const char *s2 = "handler";
    const char *s3 = "hpl";
    const char *s4 = "Zero";
    const char *strs[] = {"rebuttal", "handler", "hpl", "Zero", NULL};

    (void)argc;
    (void)argv;

    printf("Atom_vload:\n");
    /* The terminator must be passed as a real pointer: a bare NULL may be
     * the integer constant 0, which is not converted in a variadic call. */
    Atom_vload(s1, s2, s3, s4, (const char *)NULL);

    Atom_list();
    Atom_reset();

    printf("\n\nAtom_aload:\n");

    Atom_aload(strs);

    Atom_list();
    Atom_reset();

    return 0;
}

Atom_vload Atom_aload 函數見下面：

/*
 * Create an atom for each string in a NULL-terminated variadic list.
 *
 * str is the first string; the remaining arguments are read as
 * const char * until a NULL pointer is seen. If str itself is NULL,
 * nothing is loaded.
 */
void Atom_vload(const char *str, ...)
{
	va_list args;
	const char *cur = str;

	va_start(args, str);
	while (cur != NULL) {
		Atom_new(cur, strlen(cur));
		cur = va_arg(args, const char *);
	}
	va_end(args);
}

/*
 * Create an atom for each string in strs, an array of string pointers
 * terminated by a NULL entry. Entries are processed in array order.
 */
void Atom_aload(const char *strs[])
{
	const char **cursor;

	for (cursor = strs; *cursor != NULL; cursor++)
		Atom_new(*cursor, strlen(*cursor));
}

3.10
檢查 const char *Atom_add(const char *str, int len) 參數中的str不爲 NULL.

最後列出添加了新函數的atom.c文件，其中沒有使用後面內存管理章節的FREE和ALLOC函數，使用的是標準庫函數，目的是編譯方便，現在編譯時只需要：

gcc -g 3_1.c atom.c getOneword.c

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。

<> 第三章 原子

<> 第三章原子