1 /* 2 * Copyright 2004-2008, François Revol, <revol@free.fr>. 3 * Distributed under the terms of the MIT License. 4 */ 5 6 #include <errno.h> 7 #include <sys/param.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 #include <unistd.h> 12 #include <OS.h> 13 #include <KernelExport.h> 14 #include "duckduckgo_request.h" 15 #include "string_utils.h" 16 17 #define TESTME 18 19 #ifdef _KERNEL_MODE 20 #define printf dprintf 21 #undef TESTME 22 #endif 23 24 #define DBG "duckduckgofs: parse_html: " 25 26 #ifdef TESTME 27 #define BUFSZ (128*1024) 28 int dbgstep = 0; 29 #define PRST printf(DBG "step %d\n", dbgstep++) 30 #else 31 #define PRST {} 32 #endif 33 34 #define G_BEGIN_URL "<a rel=\"nofollow\" class=\"result__a\" href=\"" 35 #define G_END_URL "\">" 36 //#define G_BEGIN_NAME 37 #define G_END_NAME "</a>" 38 #define G_BEGIN_SNIPSET "<a class=\"result__snippet\"" 39 #define G_END_SNIPSET "</a>" 40 #define G_BEGIN_CACHESIM " <a class=fl href=\"" 41 #define G_END_CACHESIM "\">" 42 43 int duckduckgo_parse_results(const char *html, size_t htmlsize, long *nextid, struct duckduckgo_result **results) 44 { 45 struct duckduckgo_result *res = NULL, *nres = NULL, *prev = NULL; 46 char *p, *q; 47 char *nextresult = NULL; 48 long numres = 0; 49 long maxres = 1000; 50 //long startid = 0; 51 int done = 0; 52 int err = ENOMEM; 53 54 if (!html || !results) 55 return EINVAL; 56 /* sanity checks */ 57 printf(DBG"sanity check...\n"); 58 PRST; 59 if (strstr(html, "<!DOCTYPE html PUBLIC") != html) { 60 return EINVAL; 61 } 62 PRST; 63 p = strstr(html, "DuckDuckGo"); 64 if (!p) return EINVAL; 65 PRST; 66 p = strstr(html, "<body"); 67 if (!p) return EINVAL; 68 PRST; 69 70 /* 71 p = strstr(html, "Search Results<"); 72 if (!p) return EINVAL; 73 PRST; 74 */ 75 76 77 printf(DBG"parsing...\n"); 78 do { 79 char *item; 80 unsigned long itemlen; 81 char *tmp; 82 char *urlp; 83 int i; 84 #ifdef TESTME 85 dbgstep = 0; 86 #endif 87 nres = malloc(sizeof(struct duckduckgo_result)); 88 if (!nres) { 89 // XXX: cleanup! 90 goto err0; 91 } 92 memset(nres, 0, sizeof(struct duckduckgo_result)); 93 nres->id = (*nextid)++; //- 1; 94 95 PRST; 96 /* find url */ 97 // <p class=g><a href=URL> 98 if (!p) break; 99 if (nextresult) 100 p = nextresult; 101 else 102 p = strstr(p, G_BEGIN_URL); 103 if (!p) break; 104 PRST; 105 p+= strlen(G_BEGIN_URL); 106 nextresult = strstr(p, G_BEGIN_URL); 107 //printf(DBG"[%ld] found token 1\n", numres); 108 item = p; 109 p = strstr(p, G_END_URL); 110 if (!p) break; 111 PRST; 112 p+= strlen(G_END_URL); 113 //printf(DBG"[%ld] found token 2\n", numres); 114 itemlen = GR_MAX_URL-1; 115 urlp = nres->url; 116 itemlen = MIN(itemlen, p - item - strlen(G_END_URL)); 117 strncpy(urlp, item, itemlen); 118 urlp[itemlen] = '\0'; 119 120 /* find name */ 121 item = p; 122 p = strstr(p, G_END_NAME); 123 if (!p) break; 124 PRST; 125 p+= strlen(G_END_NAME); 126 //printf(DBG"[%ld] found token 3\n", numres); 127 itemlen = p - item - strlen(G_END_NAME); 128 //itemlen = MIN(GR_MAX_NAME-1, itemlen); 129 itemlen = MIN(GR_MAX_NAME*4-1, itemlen); 130 q = malloc(itemlen+1); 131 if (!q) 132 goto err0; 133 strncpy(q, item, itemlen); 134 q[itemlen] = '\0'; 135 /* strip <*b> off */ 136 PRST; 137 while ((tmp = strstr(q, "<b>"))) 138 strcpy(tmp, tmp + 3); 139 while ((tmp = strstr(q, "</b>"))) 140 strcpy(tmp, tmp + 4); 141 /* strip <*em> off */ 142 PRST; 143 while ((tmp = strstr(q, "<em>"))) 144 strcpy(tmp, tmp + 4); 145 while ((tmp = strstr(q, "</em>"))) 146 strcpy(tmp, tmp + 5); 147 /* strip &foo; */ 148 tmp = unentitify_string(q); 149 free(q); 150 if (!tmp) 151 goto err0; 152 strncpy(nres->name, tmp, GR_MAX_NAME-1); 153 nres->name[GR_MAX_NAME-1] = '\0'; 154 free(tmp); 155 PRST; 156 157 #if 0 158 /* find snipset */ 159 if (!p) break; 160 q = strstr(p, G_BEGIN_SNIPSET); 161 if (q && (!nextresult || (q < nextresult))) { 162 p = q; 163 p+= strlen(G_BEGIN_SNIPSET); 164 //printf(DBG"[%ld] found token 4\n", numres); 165 item = p; 166 p = strstr(p, G_END_SNIPSET); 167 if (!p) break; 168 p+= strlen(G_END_SNIPSET); 169 //printf(DBG"[%ld] found token 5\n", numres); 170 itemlen = p - item - strlen(G_END_SNIPSET); 171 itemlen = MIN(GR_MAX_URL-1, itemlen); 172 strncpy(nres->snipset, item, itemlen); 173 nres->snipset[itemlen] = '\0'; 174 /* strip &foo; */ 175 tmp = unentitify_string(nres->snipset); 176 if (!tmp) 177 break; 178 strncpy(nres->snipset, tmp, GR_MAX_SNIPSET-1); 179 nres->snipset[GR_MAX_SNIPSET-1] = '\0'; 180 free(tmp); 181 /* strip <*b> off */ 182 while ((tmp = strstr(nres->snipset, "<b>"))) 183 strcpy(tmp, tmp + 3); 184 while ((tmp = strstr(nres->snipset, "</b>"))) 185 strcpy(tmp, tmp + 4); 186 while ((tmp = strstr(nres->snipset, "\r"))) 187 strcpy(tmp, tmp + 1); 188 while ((tmp = strstr(nres->snipset, "\n"))) 189 *tmp = ' '; 190 } 191 192 #endif 193 /* find cache/similar url */ 194 for (i = 0; i < 2; i++) { 195 if (!p) break; 196 q = strstr(p, G_BEGIN_CACHESIM); 197 if (q && nextresult && (q > nextresult)) { 198 p = q; 199 printf(DBG"[%ld] cache/sim beyond next\n", numres); 200 p = nextresult; /* reset */ 201 } else if (q && (!nextresult || (q < nextresult))) { 202 //int iscache; 203 p = q; 204 p+= strlen(G_BEGIN_CACHESIM); 205 //printf(DBG"[%ld] found token 6\n", numres); 206 item = p; 207 p = strstr(p, G_END_CACHESIM); 208 if (!p) break; 209 p+= strlen(G_END_CACHESIM); 210 //printf(DBG"[%ld] found token 7\n", numres); 211 itemlen = p - item - strlen(G_END_CACHESIM); 212 itemlen = MIN(GR_MAX_URL-1, itemlen); 213 if (!strncmp(p, "Cached", 6)) { 214 strncpy(nres->cache_url, item, itemlen); 215 nres->cache_url[itemlen] = '\0'; 216 } else if (!strncmp(p, "Similar", 7)) { 217 strncpy(nres->similar_url, item, itemlen); 218 nres->similar_url[itemlen] = '\0'; 219 } 220 // else 221 // break; 222 } 223 } 224 225 numres++; 226 if (!prev) 227 res = nres; 228 else 229 prev->next = nres; 230 prev = nres; 231 nres = NULL; 232 } while (!done || numres < maxres); 233 *results = res; 234 return numres; 235 err0: 236 free(nres); 237 while (res) { 238 nres = res->next; 239 free(res); 240 res = nres; 241 } 242 return err; 243 } 244 245 #ifdef TESTME 246 int main(int argc, char **argv) 247 { 248 struct duckduckgo_result *results; 249 struct duckduckgo_result *tag1 = (void*)0xaaaa5555, *res = NULL, *tag2 = (void*)0x5555aaaa; 250 size_t len; 251 char *p; 252 int err; 253 long nextid = 0; 254 255 p = malloc(BUFSZ+8); 256 len = read(0, p+4, BUFSZ); 257 p[BUFSZ+4-1] = '\0'; 258 *(uint32 *)p = 0xa5a5a5a5; 259 *(uint32 *)(&p[BUFSZ+4]) = 0x5a5a5a5a; 260 err = duckduckgo_parse_results(p+4, len, &nextid, &results); 261 printf("error 0x%08x\n", err); 262 if (err < 0) 263 return 1; 264 res = results; 265 while (res) { 266 printf("[%ld]:\nURL='%s'\nNAME='%s'\nSNIPSET='%s'\nCACHE='%s'\nSIMILAR='%s'\n\n", res->id, res->url, res->name, res->snipset, res->cache_url, res->similar_url); 267 res = res->next; 268 } 269 printf("before = 0x%08x:0x%08x, after = 0x%08x:0x%08x\n", 0xa5a5a5a5, *(uint32 *)p, 0x5a5a5a5a, *(uint32 *)(&p[BUFSZ+4])); 270 printf("before = 0x%08x:%p, after = 0x%08x:%p\n", 0xaaaa5555, tag1, 0x5555aaaa, tag2); 271 return 0; 272 } 273 #endif 274