1 /*
2 * Copyright 2004-2008, François Revol, <revol@free.fr>.
3 * Distributed under the terms of the MIT License.
4 */
5
6 #include <ctype.h>
7 #include <malloc.h>
8 #include <string.h>
9 #include "string_utils.h"
10
/* Uncomment to build the stand-alone test main() at the end of this file. */
//#define TESTME

/*
 * In kernel mode, route printf() to dprintf() and make sure the test
 * driver is never compiled in.
 */
#ifdef _KERNEL_MODE
#define printf dprintf
#undef TESTME
#endif
17
18
19
/*
 * URL-encode a string.
 *
 * Characters from the RFC 2396 "unreserved" set are copied unchanged, a
 * space becomes '+', and every other byte is written as "%XX" using
 * uppercase hexadecimal digits.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(), or NULL if str is NULL or the buffer cannot be allocated.
 */
char *urlify_string(const char *str)
{
	static const char allowed[] = "abcdefghijklmnopqrstuvwxyz"
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"0123456789"
		"-_.!~*'()"; /* cf. RFC 2396 */
	static const char hex[] = "0123456789ABCDEF";
	const char *p;
	char *dst, *d;
	size_t len;

	if (!str)
		return NULL;

	len = strlen(str);
	/* refuse lengths for which len * 3 + 1 would wrap around */
	if (len > ((size_t)-1 - 1) / 3)
		return NULL;

	/* worst case: every byte expands to "%XX", plus the terminating NUL */
	dst = malloc(len * 3 + 1);
	if (!dst)
		return NULL;

	for (p = str, d = dst; *p; p++) {
		unsigned char byte = (unsigned char)*p;

		if (strchr(allowed, *p)) {
			*d++ = *p;
		} else if (*p == ' ') {
			*d++ = '+';
		} else {
			/* use hex value */
			*d++ = '%';
			*d++ = hex[(byte >> 4) & 0x0F];
			*d++ = hex[byte & 0x0F];
		}
	}
	*d = '\0';
	return dst;
}
50
51 // cf. http://www.htmlhelp.com/reference/html40/entities/
52
/*
 * Named HTML entities understood by unentitify_string(), as
 * { name, replacement text } pairs terminated by a { NULL, NULL } entry.
 * Every replacement is shorter than its "&name;" spelling, which is what
 * lets the decoder size its output buffer from the input length.
 */
static const char *entities_tab[][2] = {
	{ "lt", "<" },
	{ "gt", ">" },
	{ "amp", "&" },
	{ "nbsp", " " },
	{ "quot", "\"" },
	{ "raquo", "»" },
	//{ "laquo", "" },
	{ "ccedil", "ç" },
	// grave
	{ "agrave", "à" },
	{ "egrave", "è" },
	// acute
	//{ "aacute", "" },
	{ "eacute", "é" },
	// circ
	{ "acirc", "â" },
	{ "ecirc", "ê" },
	{ "icirc", "î" },
	{ "ocirc", "ô" },
	{ "ucirc", "û" },
	{ "copy", "©" },
	{ "trade", "™" },
	//{ "", "" },
	{ NULL, NULL },
};

/* Return the value of the hexadecimal digit c, or -1 if it is not one. */
static int entity_hex_value(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * Try to decode a short numeric character reference starting at p, which
 * must point at the '&'. The accepted forms are "&#nn;", "&#nnn;" and
 * "&#xhh;".
 *
 * On success the byte value is stored in *value and the distance from the
 * '&' to the terminating ';' is returned. Returns 0 when p does not start
 * such a reference, or when its value is 0 or above 255: a NUL would
 * truncate the result string and larger values do not fit in one byte.
 */
static size_t entity_parse_numeric(const char *p, unsigned char *value)
{
	unsigned int code;
	size_t skip;

	/* each test guarantees the next index is still inside the string */
	if (p[1] != '#' || p[2] == '\0' || p[3] == '\0')
		return 0;

	if (p[4] == ';'
		&& isdigit((unsigned char)p[2])
		&& isdigit((unsigned char)p[3])) {
		/* &#nn; */
		code = (unsigned int)(p[2] - '0') * 10
			+ (unsigned int)(p[3] - '0');
		skip = 4;
	} else if (p[4] != '\0' && p[5] == ';') {
		if (isdigit((unsigned char)p[2])
			&& isdigit((unsigned char)p[3])
			&& isdigit((unsigned char)p[4])) {
			/* &#nnn; */
			code = (unsigned int)(p[2] - '0') * 100
				+ (unsigned int)(p[3] - '0') * 10
				+ (unsigned int)(p[4] - '0');
		} else if (p[2] == 'x'
			&& entity_hex_value(p[3]) >= 0
			&& entity_hex_value(p[4]) >= 0) {
			/* &#xhh; */
			code = (unsigned int)entity_hex_value(p[3]) << 4
				| (unsigned int)entity_hex_value(p[4]);
		} else {
			return 0;
		}
		skip = 5;
	} else {
		return 0;
	}

	if (code == 0 || code > 0xFF)
		return 0;

	*value = (unsigned char)code;
	return skip;
}

/*
 * Decode HTML entities in a string.
 *
 * Short numeric references ("&#nn;", "&#nnn;", "&#xhh;") are replaced by
 * the single byte with that value; the bytes are written as-is and are not
 * checked for forming valid multibyte UTF-8 sequences. Named entities
 * listed in entities_tab are replaced by their text. Anything else,
 * including numeric references whose value is 0 or above 255, is copied
 * through unchanged.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(), or NULL if str is NULL or the buffer cannot be allocated.
 */
char *unentitify_string(const char *str)
{
	char *dst, *d;
	const char *p;
	int i;

	if (!str)
		return NULL;

	/* decoding never makes the text longer, so the input length suffices */
	dst = malloc(strlen(str) + 2);
	if (!dst)
		return NULL;

	for (p = str, d = dst; *p; p++) {
		char name[20];
		char *semicolon;
		unsigned char byte;
		size_t skip;

		if (*p != '&') {
			*d++ = *p;
			continue;
		}

		skip = entity_parse_numeric(p, &byte);
		if (skip != 0) {
			*d++ = (char)byte;
			/* leave p on the ';'; the loop increment steps past it */
			p += skip;
			continue;
		}

		/* look at a bounded copy of what follows the '&' */
		strncpy(name, p + 1, sizeof(name) - 1);
		name[sizeof(name) - 1] = '\0';

		semicolon = strchr(name, ';');
		if (semicolon) {
			*semicolon = '\0';
			for (i = 0; entities_tab[i][0]; i++) {
				if (!strcmp(name, entities_tab[i][0]))
					break;
			}
			if (entities_tab[i][0]) {
				size_t len = strlen(entities_tab[i][1]);

				memcpy(d, entities_tab[i][1], len);
				d += len;
				/* leave p on the ';' that closes the entity */
				p += strlen(entities_tab[i][0]) + 1;
				continue;
			}
		}

		/* not an entity we know about: keep the '&' literally */
		*d++ = '&';
	}
	*d = '\0';
	return dst;
}
146
147 #ifdef TESTME
/*
 * Stand-alone test driver: decodes the HTML entities in the first
 * command-line argument and prints the result between single quotes.
 *
 * Returns 0 on success, 1 if no argument was given or decoding failed.
 */
int main(int argc, char **argv)
{
	char *p;

	if (argc < 2)
		return 1;

	p = unentitify_string(argv[1]);
	if (!p)
		return 1;	/* passing NULL for "%s" would be undefined */

	printf("'%s'\n", p);
	free(p);
	/*
	 * NOTE(review): presumably here so that a checking allocator gets a
	 * chance to notice heap damage done by the call above - confirm.
	 */
	free(malloc(10));
	return 0;
}
159 #endif
160