xref: /haiku/src/add-ons/kernel/file_systems/websearchfs/parse_duckduckgo_html.c (revision ed24eb5ff12640d052171c6a7feba37fab8a75d1)
1 /*
2  * Copyright 2004-2008, François Revol, <revol@free.fr>.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 #include <errno.h>
7 #include <sys/param.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <OS.h>
13 #include <KernelExport.h>
14 #include "duckduckgo_request.h"
15 #include "string_utils.h"
16 
/* Build the standalone test harness (the main() guarded by TESTME) by default. */
#define TESTME

#ifdef _KERNEL_MODE
/* Kernel builds send diagnostics through dprintf() and drop the harness. */
#define printf dprintf
#undef TESTME
#endif

/* Prefix put in front of every diagnostic message of this file. */
#define DBG "duckduckgofs: parse_html: "

#ifdef TESTME
/* Size in bytes of the input buffer used by the test main(). */
#define BUFSZ (128*1024)
/* Counter printed (and advanced) by each PRST trace point. */
int dbgstep = 0;
#define PRST printf(DBG "step %d\n", dbgstep++)
#else
/*
 * No-op trace point. Written as do/while(0) rather than a bare `{}` so that
 * `PRST;` is exactly one statement and stays valid in an unbraced if/else.
 */
#define PRST do {} while (0)
#endif

/* Text that opens and closes the href of a result link. */
#define G_BEGIN_URL "<a rel=\"nofollow\" class=\"result__a\" href=\""
#define G_END_URL "\">"
/* The result title starts right after G_END_URL, so no begin marker exists. */
//#define G_BEGIN_NAME
#define G_END_NAME "</a>"
/* Snippet delimiters; only referenced by the disabled (#if 0) snippet code. */
#define G_BEGIN_SNIPSET "<a class=\"result__snippet\""
#define G_END_SNIPSET "</a>"
/* Delimiters of the optional "Cached" / "Similar" links of a result. */
#define G_BEGIN_CACHESIM " <a class=fl href=\""
#define G_END_CACHESIM "\">"
42 
43 int duckduckgo_parse_results(const char *html, size_t htmlsize, long *nextid, struct duckduckgo_result **results)
44 {
45 	struct duckduckgo_result *res = NULL, *nres = NULL, *prev = NULL;
46 	char *p, *q;
47 	char *nextresult = NULL;
48 	long numres = 0;
49 	long maxres = 1000;
50 	//long startid = 0;
51 	int done = 0;
52 	int err = ENOMEM;
53 
54 	if (!html || !results)
55 		return EINVAL;
56 	/* sanity checks */
57 	printf(DBG"sanity check...\n");
58 	PRST;
59 	if (strstr(html, "<!DOCTYPE html PUBLIC") != html) {
60 		return EINVAL;
61 	}
62 	PRST;
63 	p = strstr(html, "DuckDuckGo");
64 	if (!p) return EINVAL;
65 	PRST;
66 	p = strstr(html, "<body");
67 	if (!p) return EINVAL;
68 	PRST;
69 
70 	/*
71 	p = strstr(html, "Search Results<");
72 	if (!p) return EINVAL;
73 	PRST;
74 	*/
75 
76 
77 	printf(DBG"parsing...\n");
78 	do {
79 		char *item;
80 		unsigned long itemlen;
81 		char *tmp;
82 		char *urlp;
83 		int i;
84 #ifdef TESTME
85 		dbgstep = 0;
86 #endif
87 		nres = malloc(sizeof(struct duckduckgo_result));
88 		if (!nres) {
89 			// XXX: cleanup!
90 			goto err0;
91 		}
92 		memset(nres, 0, sizeof(struct duckduckgo_result));
93 		nres->id = (*nextid)++; //- 1;
94 
95 		PRST;
96 		/* find url */
97 		// <p class=g><a href=URL>
98 		if (!p) break;
99 		if (nextresult)
100 			p = nextresult;
101 		else
102 			p = strstr(p, G_BEGIN_URL);
103 		if (!p) break;
104 		PRST;
105 		p+= strlen(G_BEGIN_URL);
106 		nextresult = strstr(p, G_BEGIN_URL);
107 		//printf(DBG"[%ld] found token 1\n", numres);
108 		item = p;
109 		p = strstr(p, G_END_URL);
110 		if (!p) break;
111 		PRST;
112 		p+= strlen(G_END_URL);
113 		//printf(DBG"[%ld] found token 2\n", numres);
114 		itemlen = GR_MAX_URL-1;
115 		urlp = nres->url;
116 		itemlen = MIN(itemlen, p - item - strlen(G_END_URL));
117 		strncpy(urlp, item, itemlen);
118 		urlp[itemlen] = '\0';
119 
120 		/* find name */
121 		item = p;
122 		p = strstr(p, G_END_NAME);
123 		if (!p) break;
124 		PRST;
125 		p+= strlen(G_END_NAME);
126 		//printf(DBG"[%ld] found token 3\n", numres);
127 		itemlen = p - item - strlen(G_END_NAME);
128 		//itemlen = MIN(GR_MAX_NAME-1, itemlen);
129 		itemlen = MIN(GR_MAX_NAME*4-1, itemlen);
130 		q = malloc(itemlen+1);
131 		if (!q)
132 			goto err0;
133 		strncpy(q, item, itemlen);
134 		q[itemlen] = '\0';
135 		/* strip <*b> off */
136 		PRST;
137 		while ((tmp = strstr(q, "<b>")))
138 			strcpy(tmp, tmp + 3);
139 		while ((tmp = strstr(q, "</b>")))
140 			strcpy(tmp, tmp + 4);
141 		/* strip <*em> off */
142 		PRST;
143 		while ((tmp = strstr(q, "<em>")))
144 			strcpy(tmp, tmp + 4);
145 		while ((tmp = strstr(q, "</em>")))
146 			strcpy(tmp, tmp + 5);
147 		/* strip &foo; */
148 		tmp = unentitify_string(q);
149 		free(q);
150 		if (!tmp)
151 			goto err0;
152 		strncpy(nres->name, tmp, GR_MAX_NAME-1);
153 		nres->name[GR_MAX_NAME-1] = '\0';
154 		free(tmp);
155 		PRST;
156 
157 #if 0
158 		/* find snipset */
159 		if (!p) break;
160 		q = strstr(p, G_BEGIN_SNIPSET);
161 		if (q && (!nextresult || (q < nextresult))) {
162 			p = q;
163 			p+= strlen(G_BEGIN_SNIPSET);
164 			//printf(DBG"[%ld] found token 4\n", numres);
165 			item = p;
166 			p = strstr(p, G_END_SNIPSET);
167 			if (!p) break;
168 			p+= strlen(G_END_SNIPSET);
169 			//printf(DBG"[%ld] found token 5\n", numres);
170 			itemlen = p - item - strlen(G_END_SNIPSET);
171 			itemlen = MIN(GR_MAX_URL-1, itemlen);
172 			strncpy(nres->snipset, item, itemlen);
173 			nres->snipset[itemlen] = '\0';
174 			/* strip &foo; */
175 			tmp = unentitify_string(nres->snipset);
176 			if (!tmp)
177 				break;
178 			strncpy(nres->snipset, tmp, GR_MAX_SNIPSET-1);
179 			nres->snipset[GR_MAX_SNIPSET-1] = '\0';
180 			free(tmp);
181 			/* strip <*b> off */
182 			while ((tmp = strstr(nres->snipset, "<b>")))
183 				strcpy(tmp, tmp + 3);
184 			while ((tmp = strstr(nres->snipset, "</b>")))
185 				strcpy(tmp, tmp + 4);
186 			while ((tmp = strstr(nres->snipset, "\r")))
187 				strcpy(tmp, tmp + 1);
188 			while ((tmp = strstr(nres->snipset, "\n")))
189 				*tmp = ' ';
190 		}
191 
192 #endif
193 		/* find cache/similar url */
194 		for (i = 0; i < 2; i++) {
195 			if (!p) break;
196 			q = strstr(p, G_BEGIN_CACHESIM);
197 			if (q && nextresult && (q > nextresult)) {
198 				p = q;
199 				printf(DBG"[%ld] cache/sim beyond next\n", numres);
200 				p = nextresult; /* reset */
201 			} else if (q && (!nextresult || (q < nextresult))) {
202 				//int iscache;
203 				p = q;
204 				p+= strlen(G_BEGIN_CACHESIM);
205 				//printf(DBG"[%ld] found token 6\n", numres);
206 				item = p;
207 				p = strstr(p, G_END_CACHESIM);
208 				if (!p) break;
209 				p+= strlen(G_END_CACHESIM);
210 				//printf(DBG"[%ld] found token 7\n", numres);
211 				itemlen = p - item - strlen(G_END_CACHESIM);
212 				itemlen = MIN(GR_MAX_URL-1, itemlen);
213 				if (!strncmp(p, "Cached", 6)) {
214 					strncpy(nres->cache_url, item, itemlen);
215 					nres->cache_url[itemlen] = '\0';
216 				} else if (!strncmp(p, "Similar", 7)) {
217 					strncpy(nres->similar_url, item, itemlen);
218 					nres->similar_url[itemlen] = '\0';
219 				}
220 //				 else
221 //					break;
222 			}
223 		}
224 
225 		numres++;
226 		if (!prev)
227 			res = nres;
228 		else
229 			prev->next = nres;
230 		prev = nres;
231 		nres = NULL;
232 	} while (!done || numres < maxres);
233 	*results = res;
234 	return numres;
235 err0:
236 	free(nres);
237 	while (res) {
238 		nres = res->next;
239 		free(res);
240 		res = nres;
241 	}
242 	return err;
243 }
244 
245 #ifdef TESTME
246 int main(int argc, char **argv)
247 {
248 	struct duckduckgo_result *results;
249 	struct duckduckgo_result *tag1 = (void*)0xaaaa5555, *res = NULL, *tag2 = (void*)0x5555aaaa;
250 	size_t len;
251 	char *p;
252 	int err;
253 	long nextid = 0;
254 
255 	p = malloc(BUFSZ+8);
256 	len = read(0, p+4, BUFSZ);
257 	p[BUFSZ+4-1] = '\0';
258 	*(uint32 *)p = 0xa5a5a5a5;
259 	*(uint32 *)(&p[BUFSZ+4]) = 0x5a5a5a5a;
260 	err = duckduckgo_parse_results(p+4, len, &nextid, &results);
261 	printf("error 0x%08x\n", err);
262 	if (err < 0)
263 		return 1;
264 	res = results;
265 	while (res) {
266 		printf("[%ld]:\nURL='%s'\nNAME='%s'\nSNIPSET='%s'\nCACHE='%s'\nSIMILAR='%s'\n\n", res->id, res->url, res->name, res->snipset, res->cache_url, res->similar_url);
267 		res = res->next;
268 	}
269 	printf("before = 0x%08x:0x%08x, after = 0x%08x:0x%08x\n", 0xa5a5a5a5, *(uint32 *)p, 0x5a5a5a5a, *(uint32 *)(&p[BUFSZ+4]));
270 	printf("before = 0x%08x:%p, after = 0x%08x:%p\n", 0xaaaa5555, tag1, 0x5555aaaa, tag2);
271 	return 0;
272 }
273 #endif
274