/*
 * Copyright 2004-2008, François Revol, <revol@free.fr>.
 * Distributed under the terms of the MIT License.
 */

#include <ctype.h>
#include <malloc.h>
#include <string.h>
/*
 * The project header is included whenever the preprocessor can find it.
 * Compilers that lack __has_include include it unconditionally.
 */
#ifdef __has_include
#	if __has_include("string_utils.h")
#		include "string_utils.h"
#	endif
#else
#	include "string_utils.h"
#endif

//#define TESTME

#ifdef _KERNEL_MODE
#define printf dprintf
#undef TESTME
#endif


/*
 * Returns a newly malloc()ed, URL-escaped copy of str.
 *
 * Bytes listed in `allowed` (letters, digits and the RFC 2396 "mark"
 * characters) are copied verbatim, a space becomes '+', and every other
 * byte is written as "%XX" using uppercase hexadecimal digits.
 *
 * Returns NULL when str is NULL, when the worst-case output size does not
 * fit in a size_t, or when the allocation fails. The caller owns the
 * returned buffer and releases it with free().
 */
char *urlify_string(const char *str)
{
	static const char allowed[] = "abcdefghijklmnopqrstuvwxyz"
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"0123456789"
		"-_.!~*'()"; /* cf. RFC 2396 */
	static const char hex[] = "0123456789ABCDEF";
	const unsigned char *p;
	char *dst, *d;
	size_t len;

	if (!str)
		return NULL;

	len = strlen(str);
	/* Worst case: every byte expands to "%XX", plus the terminating NUL. */
	if (len > ((size_t)-1 - 1) / 3)
		return NULL;
	dst = malloc(len * 3 + 1);
	if (!dst)
		return NULL;

	/* Walk the input as unsigned bytes so the hex split is well defined. */
	for (p = (const unsigned char *)str, d = dst; *p; p++) {
		if (strchr(allowed, *p)) {
			*d++ = (char)*p;
		} else if (*p == ' ') {
			*d++ = '+';
		} else {
			/* use hex value */
			*d++ = '%';
			*d++ = hex[(*p >> 4) & 0x0F];
			*d++ = hex[*p & 0x0F];
		}
	}
	*d = '\0';
	return dst;
}

/* cf. */
/* Entity reference: http://www.htmlhelp.com/reference/html40/entities/ */

/*
 * Named entities understood by unentitify_string(): { name, replacement }.
 * The list is terminated by a { NULL, NULL } pair.
 */
static const char *entities_tab[][2] = {
	{ "lt", "<" },
	{ "gt", ">" },
	{ "amp", "&" },
	{ "nbsp", " " },
	{ "quot", "\"" },
	{ "raquo", "»" },
	//{ "laquo", "" },
	{ "ccedil", "ç" },
	// grave
	{ "agrave", "à" },
	{ "egrave", "è" },
	// acute
	//{ "aacute", "" },
	{ "eacute", "é" },
	// circ
	{ "acirc", "â" },
	{ "ecirc", "ê" },
	{ "icirc", "î" },
	{ "ocirc", "ô" },
	{ "ucirc", "û" },
	{ "copy", "©" },
	{ "trade", "™" },
	//{ "", "" },
	{ NULL, NULL },
};

/* Returns the value of hex digit c (either case), or -1 if c is not one. */
static int hex_digit_value(char c)
{
	static const char digits[] = "0123456789abcdef";
	const char *pos;

	/* strchr() would otherwise match the terminator of `digits`. */
	if (c == '\0')
		return -1;
	/* <ctype.h> functions require a value representable as unsigned char. */
	pos = strchr(digits, tolower((unsigned char)c));
	return pos ? (int)(pos - digits) : -1;
}

/*
 * Tries to decode a numeric character reference starting at p, where
 * p[0] is '&'. Recognized spellings are "&#nn;", "&#nnn;" (decimal) and
 * "&#xnn;" (hexadecimal, lowercase 'x' only).
 *
 * On success stores the decoded byte in *out and returns the offset of the
 * closing ';' relative to p (4 or 5). Returns 0 when p does not start such
 * a reference or when the value is outside 1..255: a zero would truncate
 * the output string and larger values do not fit in one byte.
 */
static int decode_numeric_entity(const char *p, unsigned char *out)
{
	int value, skip;

	if (p[1] != '#')
		return 0;

	/*
	 * Each test short-circuits on the first mismatch (including the
	 * terminating NUL), so nothing past the end of the string is read.
	 */
	if (isdigit((unsigned char)p[2]) && isdigit((unsigned char)p[3])
		&& p[4] == ';') {
		/* &#nn; */
		value = (p[2] - '0') * 10 + (p[3] - '0');
		skip = 4;
	} else if (isdigit((unsigned char)p[2]) && isdigit((unsigned char)p[3])
		&& isdigit((unsigned char)p[4]) && p[5] == ';') {
		/* &#nnn; */
		value = (p[2] - '0') * 100 + (p[3] - '0') * 10 + (p[4] - '0');
		skip = 5;
	} else if (p[2] == 'x' && hex_digit_value(p[3]) >= 0
		&& hex_digit_value(p[4]) >= 0 && p[5] == ';') {
		/* &#xnn; */
		value = (hex_digit_value(p[3]) << 4) | hex_digit_value(p[4]);
		skip = 5;
	} else
		return 0;

	if (value < 1 || value > 255)
		return 0;
	*out = (unsigned char)value;
	return skip;
}

/*
 * Returns a newly malloc()ed copy of str with HTML character references
 * replaced, or NULL when str is NULL or the allocation fails. The caller
 * owns the returned buffer and releases it with free().
 *
 * Numeric references ("&#nn;", "&#nnn;", "&#xnn;") become the single byte
 * with that value; the result is not checked for being valid UTF-8.
 * Named references are looked up in entities_tab. Anything else, including
 * numeric references whose value is 0 or above 255, is copied unchanged.
 */
char *unentitify_string(const char *str)
{
	char *dst, *d;
	const char *p;

	if (!str)
		return NULL;
	/* Every replacement is at most as long as the text it replaces. */
	dst = malloc(strlen(str) + 2);
	if (!dst)
		return NULL;

	for (p = str, d = dst; *p; p++) {
		char name[20];
		unsigned char byte;
		size_t n, i, rlen;
		int skip;

		if (*p != '&') {
			*d++ = *p;
			continue;
		}

		skip = decode_numeric_entity(p, &byte);
		if (skip) {
			*d++ = (char)byte;
			p += skip;	/* now on the ';', the loop steps past it */
			continue;
		}

		/* Collect the entity name: at most 18 characters before the ';'. */
		n = 0;
		while (n < sizeof(name) - 1 && p[1 + n] != '\0' && p[1 + n] != ';') {
			name[n] = p[1 + n];
			n++;
		}
		if (n == sizeof(name) - 1 || p[1 + n] != ';') {
			/* no terminator close enough: a plain ampersand */
			*d++ = *p;
			continue;
		}
		name[n] = '\0';

		for (i = 0; entities_tab[i][0]; i++) {
			if (strcmp(name, entities_tab[i][0]) == 0)
				break;
		}
		rlen = entities_tab[i][0] ? strlen(entities_tab[i][1]) : 0;
		/*
		 * "&name;" occupies n + 2 input bytes; never write more than that
		 * so the output cannot outgrow the buffer sized from the input.
		 */
		if (entities_tab[i][0] && rlen <= n + 2) {
			memcpy(d, entities_tab[i][1], rlen);
			d += rlen;
			p += n + 1;	/* now on the ';', the loop steps past it */
		} else {
			/* not found: keep the '&', the name is copied by later passes */
			*d++ = '&';
		}
	}
	*d = '\0';
	return dst;
}

#ifdef TESTME
int main(int argc, char **argv)
{
	char *p;
	if (argc < 2)
		return 1;
	p = unentitify_string(argv[1]);
	printf("'%s'\n", p);
	free(p);
	free(malloc(10));
	return 0;
}
#endif