xml.zig (10654B)
// Minimal, allocation-free pull parser for a pragmatic subset of XML.
//
// `Parser.next()` yields a stream of `Event`s whose slices point either into
// the document handed to `Parser.init` or into a small buffer owned by the
// parser (for decoded entities). No well-formedness checking is performed.

const std = @import("std");
const mem = std.mem;

pub const Parser = struct {
    /// The not-yet-consumed tail of the input.
    document: []const u8,
    /// Name of the most recently opened tag; used to synthesize the
    /// `close_tag` event for self-closing elements (`<foo/>`).
    current_tag: []const u8 = undefined,
    /// Scratch space for the UTF-8 encoding of a decoded entity. A
    /// `character_data` event produced from an entity points into this
    /// buffer and is only valid until the next call to `next()`.
    char_buffer: [4]u8 = undefined,
    mode: enum { normal, attrs, chars, entity } = .normal,

    /// Creates a parser over `document`. The parser borrows the slice; it
    /// never copies or modifies it.
    pub fn init(document: []const u8) Parser {
        return Parser{
            .document = document,
        };
    }

    /// Returns the next event, or null when the input is exhausted or
    /// something the parser cannot handle is encountered.
    pub fn next(p: *Parser) ?Event {
        return switch (p.mode) {
            .normal => p.nextNormal(),
            .attrs => p.nextAttrs(),
            .chars => p.nextChars(),
            .entity => p.nextEntity(),
        };
    }

    /// Handles markup (`<?...?>`, `</...>`, `<!--...-->`, `<![CDATA[...]]>`,
    /// `<name`) after skipping leading whitespace; anything not starting
    /// with '<' is delegated to `nextChars`.
    fn nextNormal(p: *Parser) ?Event {
        p.skipWhitespace();
        switch (p.peek(0) orelse return null) {
            '<' => switch (p.peek(1) orelse return null) {
                '?' => {
                    if (mem.indexOf(u8, p.document[2..], "?>")) |end| {
                        const ev = Event{ .processing_instruction = p.document[2 .. end + 2] };
                        p.document = p.document[end + 4 ..];
                        return ev;
                    }
                },
                '/' => switch (p.peek(2) orelse return null) {
                    ':', 'A'...'Z', '_', 'a'...'z' => {
                        if (mem.indexOfScalar(u8, p.document[3..], '>')) |end| {
                            const ev = Event{ .close_tag = p.document[2 .. end + 3] };
                            p.document = p.document[end + 4 ..];
                            return ev;
                        }
                    },
                    else => {},
                },
                '!' => switch (p.peek(2) orelse return null) {
                    '-' => if ((p.peek(3) orelse return null) == '-') {
                        // Search for the terminator strictly after the
                        // opening "<!--" so that its dashes cannot be
                        // mistaken for part of "-->" (e.g. "<!--->").
                        if (mem.indexOf(u8, p.document[4..], "-->")) |end| {
                            const ev = Event{ .comment = p.document[4 .. end + 4] };
                            p.document = p.document[end + 7 ..];
                            return ev;
                        }
                    },
                    '[' => if (mem.startsWith(u8, p.document[3..], "CDATA[")) {
                        // "<![CDATA[" is 9 bytes long and contains no ']',
                        // so `end` is always >= 9.
                        if (mem.indexOf(u8, p.document, "]]>")) |end| {
                            const ev = Event{ .character_data = p.document[9..end] };
                            p.document = p.document[end + 3 ..];
                            return ev;
                        }
                    },
                    else => {},
                },
                ':', 'A'...'Z', '_', 'a'...'z' => {
                    // An open tag is only reported if a closing '>' exists.
                    const angle = mem.indexOfScalar(u8, p.document, '>') orelse return null;
                    // The name ends at the first whitespace character (the
                    // same set `skipWhitespace` accepts) or '/', whichever
                    // comes first, or at '>' when there is neither.
                    const name_end = mem.indexOfAny(u8, p.document[0..angle], " \t\n\r/") orelse angle;
                    const ev = Event{ .open_tag = p.document[1..name_end] };
                    p.current_tag = ev.open_tag;
                    p.document = p.document[name_end..];
                    p.mode = .attrs;
                    return ev;
                },
                else => {},
            },
            else => {
                p.mode = .chars;
                return p.nextChars();
            },
        }
        // Unterminated or unrecognized markup.
        return null;
    }

    /// Parses the inside of an open tag: either one `name = "value"`
    /// attribute, the closing '>' (after which normal parsing resumes), or
    /// the self-closing "/>" (reported as a `close_tag` event).
    fn nextAttrs(p: *Parser) ?Event {
        p.skipWhitespace();
        switch (p.peek(0) orelse return null) {
            '>' => {
                p.document = p.document[1..];
                p.mode = .normal;
                return p.nextNormal();
            },
            '/' => {
                const ev = Event{ .close_tag = p.current_tag };
                if ((p.peek(1) orelse return null) != '>')
                    return null;
                p.document = p.document[2..];
                p.mode = .normal;
                return ev;
            },
            else => {},
        }

        var i: usize = 0;
        while (isNameChar(p.peek(i) orelse return null)) : (i += 1) {}
        const name = p.document[0..i];

        p.document = p.document[i..];
        p.skipWhitespace();

        if ((p.peek(0) orelse return null) != '=')
            return null;

        p.document = p.document[1..];
        p.skipWhitespace();

        // The value must be quoted; it ends at the next occurrence of the
        // same quote character.
        const quote = p.peek(0) orelse return null;
        switch (quote) {
            '\'', '"' => {
                if (mem.indexOfScalar(u8, p.document[1..], quote)) |end| {
                    const ev = Event{
                        .attribute = .{
                            .name = name,
                            .raw_value = p.document[1 .. end + 1],
                        },
                    };
                    p.document = p.document[end + 2 ..];
                    return ev;
                }
            },
            else => {},
        }

        return null;
    }

    /// Emits the text up to (not including) the next '<' or '&' and
    /// switches mode accordingly. The emitted slice may be empty. Returns
    /// null when neither delimiter remains, so text that is not followed
    /// by markup or an entity is not reported.
    fn nextChars(p: *Parser) ?Event {
        const i = mem.indexOfAny(u8, p.document, "<&") orelse return null;
        const ev = Event{ .character_data = p.document[0..i] };
        p.mode = switch (p.document[i]) {
            '<' => .normal,
            '&' => .entity,
            else => unreachable,
        };
        p.document = p.document[i..];
        return ev;
    }

    /// Decodes the entity at the start of `s` (the text immediately after
    /// an '&'). On success the code point is stored in `buf` as a native
    /// endian u32 and the index of the terminating ';' within `s` is
    /// returned. Returns null for unterminated, unknown or malformed
    /// entities. Supports the five predefined entities plus decimal
    /// (`#NNN`) and hexadecimal (`#xHHH`) character references.
    fn parseEntity(s: []const u8, buf: *[4]u8) ?usize {
        const semi = mem.indexOfScalar(u8, s, ';') orelse return null;
        const entity = s[0..semi];
        if (mem.eql(u8, entity, "lt")) {
            buf.* = mem.toBytes(@as(u32, '<'));
        } else if (mem.eql(u8, entity, "gt")) {
            buf.* = mem.toBytes(@as(u32, '>'));
        } else if (mem.eql(u8, entity, "amp")) {
            buf.* = mem.toBytes(@as(u32, '&'));
        } else if (mem.eql(u8, entity, "apos")) {
            buf.* = mem.toBytes(@as(u32, '\''));
        } else if (mem.eql(u8, entity, "quot")) {
            buf.* = mem.toBytes(@as(u32, '"'));
        } else if (mem.startsWith(u8, entity, "#x")) {
            const codepoint = std.fmt.parseInt(u21, entity[2..], 16) catch return null;
            buf.* = mem.toBytes(@as(u32, codepoint));
        } else if (mem.startsWith(u8, entity, "#")) {
            const codepoint = std.fmt.parseInt(u21, entity[1..], 10) catch return null;
            buf.* = mem.toBytes(@as(u32, codepoint));
        } else {
            return null;
        }
        return semi;
    }

    /// Decodes the entity at the cursor and emits its UTF-8 encoding as
    /// character data backed by `char_buffer`. Returns null if the entity
    /// cannot be decoded or is not encodable as UTF-8.
    fn nextEntity(p: *Parser) ?Event {
        if ((p.peek(0) orelse return null) != '&')
            return null;

        if (parseEntity(p.document[1..], &p.char_buffer)) |semi| {
            const codepoint = mem.bytesToValue(u32, &p.char_buffer);
            const n = std.unicode.utf8Encode(@intCast(codepoint), &p.char_buffer) catch return null;
            // Skip the '&', the entity body and the ';'.
            p.document = p.document[semi + 2 ..];
            p.mode = .chars;
            return Event{ .character_data = p.char_buffer[0..n] };
        }

        return null;
    }

    /// Whether `c` may appear in a (ASCII-only) XML name.
    fn isNameChar(c: u8) bool {
        return switch (c) {
            ':', 'A'...'Z', '_', 'a'...'z', '-', '.', '0'...'9' => true,
            else => false,
        };
    }

    /// Advances past any run of space, tab, newline and carriage return.
    fn skipWhitespace(p: *Parser) void {
        while (true) {
            switch (p.peek(0) orelse return) {
                ' ', '\t', '\n', '\r' => {
                    p.document = p.document[1..];
                },
                else => {
                    return;
                },
            }
        }
    }

    /// Returns the byte `n` positions ahead of the cursor, or null if the
    /// remaining input is too short.
    fn peek(p: *Parser, n: usize) ?u8 {
        if (p.document.len <= n)
            return null;

        return p.document[n];
    }
};

/// One item of the token stream produced by `Parser.next`.
pub const Event = union(enum) {
    /// Element name of an opening tag; its attributes follow as separate
    /// `attribute` events.
    open_tag: []const u8,
    /// Element name of a closing tag. Also emitted for self-closing tags.
    close_tag: []const u8,
    attribute: Attribute,
    /// Text between "<!--" and "-->".
    comment: []const u8,
    /// Text between "<?" and "?>".
    processing_instruction: []const u8,
    /// Literal text, CDATA content, or a single decoded entity.
    character_data: []const u8,
};

/// An attribute whose value is still in its raw (entity-encoded) form.
/// The value can be consumed piecewise with `next`, or through the
/// non-consuming helpers which operate on a copy.
pub const Attribute = struct {
    name: []const u8,
    /// Text between the quotes, entities not yet decoded.
    raw_value: []const u8,
    /// Scratch space for the UTF-8 encoding of a decoded entity.
    char_buffer: [4]u8 = undefined,

    /// Returns the decoded value in freshly allocated memory.
    /// Caller owns the returned slice and must free it with `allocator`.
    pub fn dupeValue(attr: Attribute, allocator: mem.Allocator) error{OutOfMemory}![]u8 {
        // First pass: measure, so that exactly one allocation is made.
        var measure = attr;
        var total: usize = 0;
        while (measure.next()) |fragment| total += fragment.len;

        const out = try allocator.alloc(u8, total);

        // Second pass: decoding is deterministic, so the same fragments
        // are produced again.
        var copy = attr;
        var pos: usize = 0;
        while (copy.next()) |fragment| {
            @memcpy(out[pos..][0..fragment.len], fragment);
            pos += fragment.len;
        }
        return out;
    }

    /// Whether the decoded value begins with `prefix`. An empty prefix
    /// matches every value.
    pub fn valueStartsWith(attr: Attribute, prefix: []const u8) bool {
        var attr_copy = attr;
        var matched: usize = 0;

        while (matched < prefix.len) {
            // Value ended (or became undecodable) before the prefix did.
            const fragment = attr_copy.next() orelse return false;
            const remaining = prefix[matched..];
            // Compare only the overlap: a fragment may be shorter than the
            // rest of the prefix, or extend beyond it.
            const n = @min(fragment.len, remaining.len);
            if (!mem.eql(u8, fragment[0..n], remaining[0..n]))
                return false;
            matched += n;
        }

        return true;
    }

    /// Whether the decoded value is exactly `value`.
    pub fn valueEql(attr: Attribute, value: []const u8) bool {
        var attr_copy = attr;
        var i: usize = 0;

        while (attr_copy.next()) |fragment| {
            if (mem.startsWith(u8, value[i..], fragment)) {
                i += fragment.len;
            } else {
                return false;
            }
        }

        return i == value.len;
    }

    /// Returns the next fragment of the decoded value: either a run of
    /// literal text or one decoded entity (backed by `char_buffer`, valid
    /// until the next call). Returns null when the value is exhausted or
    /// an entity cannot be decoded. Fragments are never empty.
    pub fn next(attr: *Attribute) ?[]const u8 {
        if (attr.raw_value.len == 0)
            return null;

        if (attr.raw_value[0] == '&') {
            const semi = Parser.parseEntity(attr.raw_value[1..], &attr.char_buffer) orelse return null;
            const codepoint = mem.bytesToValue(u32, &attr.char_buffer);
            const n = std.unicode.utf8Encode(@intCast(codepoint), &attr.char_buffer) catch return null;
            // Skip the '&', the entity body and the ';'.
            attr.raw_value = attr.raw_value[semi + 2 ..];
            return attr.char_buffer[0..n];
        }

        // Literal text up to the next entity or the end of the value.
        const end = mem.indexOfScalar(u8, attr.raw_value, '&') orelse attr.raw_value.len;
        const fragment = attr.raw_value[0..end];
        attr.raw_value = attr.raw_value[end..];
        return fragment;
    }
};