001 /* 002 * (c) Copyright 2009 University of Bristol 003 * All rights reserved. 004 * [See end of file] 005 */ 006 package net.rootdev.javardfa; 007 008 import net.rootdev.javardfa.uri.URIExtractor10; 009 import net.rootdev.javardfa.uri.URIExtractor; 010 import net.rootdev.javardfa.uri.IRIResolver; 011 import net.rootdev.javardfa.literal.LiteralCollector; 012 import java.util.Collection; 013 import java.util.EnumSet; 014 import java.util.Iterator; 015 import java.util.LinkedList; 016 import java.util.List; 017 import java.util.Set; 018 import javax.xml.namespace.QName; 019 import javax.xml.stream.XMLEventFactory; 020 import javax.xml.stream.XMLOutputFactory; 021 import javax.xml.stream.XMLStreamException; 022 import javax.xml.stream.events.Attribute; 023 import javax.xml.stream.events.StartElement; 024 import javax.xml.stream.events.XMLEvent; 025 import org.xml.sax.Attributes; 026 import org.xml.sax.ContentHandler; 027 import org.xml.sax.Locator; 028 import org.xml.sax.SAXException; 029 030 /** 031 * @author Damian Steer <pldms@mac.com> 032 */ 033 public class Parser implements ContentHandler { 034 035 private final XMLEventFactory eventFactory; 036 private final StatementSink sink; 037 private final Set<Setting> settings; 038 private final LiteralCollector literalCollector; 039 private final URIExtractor extractor; 040 private final ProfileCollector profileCollector; 041 042 public Parser(StatementSink sink) { 043 this( sink, 044 XMLOutputFactory.newInstance(), 045 XMLEventFactory.newInstance(), 046 new URIExtractor10(new IRIResolver()), 047 ProfileCollector.EMPTY_COLLECTOR); 048 } 049 050 public Parser(StatementSink sink, 051 XMLOutputFactory outputFactory, 052 XMLEventFactory eventFactory, 053 URIExtractor extractor, 054 ProfileCollector profileCollector) { 055 this.sink = sink; 056 this.eventFactory = eventFactory; 057 this.settings = EnumSet.noneOf(Setting.class); 058 this.extractor = extractor; 059 this.literalCollector = new LiteralCollector(this, eventFactory, outputFactory); 060 this.profileCollector = profileCollector; 061 062 extractor.setSettings(settings); 063 064 // Important, although I guess the caller doesn't get total control 065 outputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true); 066 } 067 068 public void enable(Setting setting) { 069 settings.add(setting); 070 } 071 072 public void disable(Setting setting) { 073 settings.remove(setting); 074 } 075 076 public void setBase(String base) { 077 this.context = new EvalContext(base); 078 sink.setBase(context.getBase()); 079 } 080 081 EvalContext parse(EvalContext context, StartElement element) 082 throws XMLStreamException { 083 boolean skipElement = false; 084 String newSubject = null; 085 String currentObject = null; 086 List<String> forwardProperties = new LinkedList(); 087 List<String> backwardProperties = new LinkedList(); 088 String currentLanguage = context.language; 089 090 if (settings.contains(Setting.OnePointOne)) { 091 092 if (element.getAttributeByName(Constants.vocab) != null) { 093 context.vocab = 094 element.getAttributeByName(Constants.vocab).getValue().trim(); 095 } 096 097 if (element.getAttributeByName(Constants.prefix) != null) { 098 parsePrefixes(element.getAttributeByName(Constants.prefix).getValue(), context); 099 } 100 101 if (element.getAttributeByName(Constants.profile) != null) { 102 String profileURI = extractor.resolveURI( 103 element.getAttributeByName(Constants.profile).getValue(), 104 context); 105 profileCollector.getProfile( 106 profileURI, 107 context); 108 } 109 } 110 111 // The xml / html namespace matching is a bit ropey. I wonder if the html 5 112 // parser has a setting for this? 113 if (settings.contains(Setting.ManualNamespaces)) { 114 if (element.getAttributeByName(Constants.xmllang) != null) { 115 currentLanguage = element.getAttributeByName(Constants.xmllang).getValue(); 116 if (currentLanguage.length() == 0) currentLanguage = null; 117 } else if (element.getAttributeByName(Constants.lang) != null) { 118 currentLanguage = element.getAttributeByName(Constants.lang).getValue(); 119 if (currentLanguage.length() == 0) currentLanguage = null; 120 } 121 } else if (element.getAttributeByName(Constants.xmllangNS) != null) { 122 currentLanguage = element.getAttributeByName(Constants.xmllangNS).getValue(); 123 if (currentLanguage.length() == 0) currentLanguage = null; 124 } 125 126 if (Constants.base.equals(element.getName()) && 127 element.getAttributeByName(Constants.href) != null) { 128 context.setBase(element.getAttributeByName(Constants.href).getValue()); 129 sink.setBase(context.getBase()); 130 } 131 132 if (element.getAttributeByName(Constants.rev) == null && 133 element.getAttributeByName(Constants.rel) == null) { 134 Attribute nSubj = findAttribute(element, Constants.about, Constants.src, 135 Constants.resource, Constants.href); 136 if (nSubj != null) { 137 newSubject = extractor.getURI(element, nSubj, context); 138 } 139 if (newSubject == null) { 140 if (Constants.body.equals(element.getName()) || 141 Constants.head.equals(element.getName())) { 142 newSubject = context.base; 143 } 144 else if (element.getAttributeByName(Constants.typeof) != null) { 145 newSubject = createBNode(); 146 } else { 147 if (context.parentObject != null) { 148 newSubject = context.parentObject; 149 } 150 if (element.getAttributeByName(Constants.property) == null) { 151 skipElement = true; 152 } 153 } 154 } 155 } else { 156 Attribute nSubj = findAttribute(element, Constants.about, Constants.src); 157 if (nSubj != null) { 158 newSubject = extractor.getURI(element, nSubj, context); 159 } 160 if (newSubject == null) { 161 // if element is head or body assume about="" 162 if (Constants.head.equals(element.getName()) || 163 Constants.body.equals(element.getName())) { 164 newSubject = context.base; 165 } else if (element.getAttributeByName(Constants.typeof) != null) { 166 newSubject = createBNode(); 167 } else if (context.parentObject != null) { 168 newSubject = context.parentObject; 169 } 170 } 171 Attribute cObj = findAttribute(element, Constants.resource, Constants.href); 172 if (cObj != null) { 173 currentObject = extractor.getURI(element, cObj, context); 174 } 175 } 176 177 if (newSubject != null && element.getAttributeByName(Constants.typeof) != null) { 178 List<String> types = extractor.getURIs(element, 179 element.getAttributeByName(Constants.typeof), context); 180 for (String type : types) { 181 emitTriples(newSubject, 182 Constants.rdfType, 183 type); 184 } 185 } 186 187 // Dodgy extension 188 if (settings.contains(Setting.FormMode)) { 189 if (Constants.form.equals(element.getName())) { 190 emitTriples(newSubject, Constants.rdfType, "http://www.w3.org/1999/xhtml/vocab/#form"); // Signal entering form 191 } 192 if (Constants.input.equals(element.getName()) && 193 element.getAttributeByName(Constants.name) != null) { 194 currentObject = "?" + element.getAttributeByName(Constants.name).getValue(); 195 } 196 197 } 198 199 if (currentObject != null) { 200 if (element.getAttributeByName(Constants.rel) != null) { 201 emitTriples(newSubject, 202 extractor.getURIs(element, 203 element.getAttributeByName(Constants.rel), context), 204 currentObject); 205 } 206 if (element.getAttributeByName(Constants.rev) != null) { 207 emitTriples(currentObject, 208 extractor.getURIs(element, element.getAttributeByName(Constants.rev), context), 209 newSubject); 210 } 211 } else { 212 if (element.getAttributeByName(Constants.rel) != null) { 213 forwardProperties.addAll(extractor.getURIs(element, 214 element.getAttributeByName(Constants.rel), context)); 215 } 216 if (element.getAttributeByName(Constants.rev) != null) { 217 backwardProperties.addAll(extractor.getURIs(element, 218 element.getAttributeByName(Constants.rev), context)); 219 } 220 if (!forwardProperties.isEmpty() || !backwardProperties.isEmpty()) { 221 // if predicate present 222 currentObject = createBNode(); 223 } 224 } 225 226 // Getting literal values. Complicated! 227 if (element.getAttributeByName(Constants.property) != null) { 228 List<String> props = extractor.getURIs(element, 229 element.getAttributeByName(Constants.property), context); 230 String dt = getDatatype(element); 231 if (element.getAttributeByName(Constants.content) != null) { // The easy bit 232 String lex = element.getAttributeByName(Constants.content).getValue(); 233 if (dt == null || dt.length() == 0) { 234 emitTriplesPlainLiteral(newSubject, props, lex, currentLanguage); 235 } else { 236 emitTriplesDatatypeLiteral(newSubject, props, lex, dt); 237 } 238 } else { 239 literalCollector.collect(newSubject, props, dt, currentLanguage); 240 } 241 } 242 243 if (!skipElement && newSubject != null) { 244 emitTriples(context.parentSubject, 245 context.forwardProperties, 246 newSubject); 247 248 emitTriples(newSubject, 249 context.backwardProperties, 250 context.parentSubject); 251 } 252 253 EvalContext ec = new EvalContext(context); 254 if (skipElement) { 255 ec.language = currentLanguage; 256 } else { 257 if (newSubject != null) { 258 ec.parentSubject = newSubject; 259 } else { 260 ec.parentSubject = context.parentSubject; 261 } 262 263 if (currentObject != null) { 264 ec.parentObject = currentObject; 265 } else if (newSubject != null) { 266 ec.parentObject = newSubject; 267 } else { 268 ec.parentObject = context.parentSubject; 269 } 270 271 ec.language = currentLanguage; 272 ec.forwardProperties = forwardProperties; 273 ec.backwardProperties = backwardProperties; 274 } 275 return ec; 276 } 277 278 private Attribute findAttribute(StartElement element, QName... names) { 279 for (QName aName : names) { 280 Attribute a = element.getAttributeByName(aName); 281 if (a != null) { 282 return a; 283 } 284 } 285 return null; 286 } 287 288 public void emitTriples(String subj, Collection<String> props, String obj) { 289 for (String prop : props) { 290 sink.addObject(subj, prop, obj); 291 } 292 } 293 294 public void emitTriplesPlainLiteral(String subj, Collection<String> props, String lex, String language) { 295 for (String prop : props) { 296 sink.addLiteral(subj, prop, lex, language, null); 297 } 298 } 299 300 public void emitTriplesDatatypeLiteral(String subj, Collection<String> props, String lex, String datatype) { 301 for (String prop : props) { 302 sink.addLiteral(subj, prop, lex, null, datatype); 303 } 304 } 305 306 int bnodeId = 0; 307 308 private String createBNode() // TODO probably broken? Can you write bnodes in rdfa directly? 309 { 310 return "_:node" + (bnodeId++); 311 } 312 313 private String getDatatype(StartElement element) { 314 Attribute de = element.getAttributeByName(Constants.datatype); 315 if (de == null) { 316 return null; 317 } 318 String dt = de.getValue(); 319 if (dt.length() == 0) { 320 return dt; 321 } 322 return extractor.expandCURIE(element, dt, context); 323 } 324 325 private void getNamespaces(Attributes attrs) { 326 for (int i = 0; i < attrs.getLength(); i++) { 327 String qname = attrs.getQName(i); 328 String prefix = getPrefix(qname); 329 if ("xmlns".equals(prefix)) { 330 String pre = getLocal(prefix, qname); 331 String uri = attrs.getValue(i); 332 if (!settings.contains(Setting.ManualNamespaces) && pre.contains("_")) 333 continue; // not permitted 334 context.setNamespaceURI(pre, uri); 335 sink.addPrefix(pre, uri); 336 } 337 } 338 } 339 340 private String getPrefix(String qname) { 341 if (!qname.contains(":")) { 342 return ""; 343 } 344 return qname.substring(0, qname.indexOf(":")); 345 } 346 347 private String getLocal(String prefix, String qname) { 348 if (prefix.length() == 0) { 349 return qname; 350 } 351 return qname.substring(prefix.length() + 1); 352 } 353 /** 354 * SAX methods 355 */ 356 private Locator locator; 357 private EvalContext context; 358 359 public void setDocumentLocator(Locator arg0) { 360 this.locator = arg0; 361 if (locator.getSystemId() != null) 362 this.setBase(arg0.getSystemId()); 363 } 364 365 public void startDocument() throws SAXException { 366 sink.start(); 367 } 368 369 public void endDocument() throws SAXException { 370 sink.end(); 371 } 372 373 public void startPrefixMapping(String arg0, String arg1) 374 throws SAXException { 375 context.setNamespaceURI(arg0, arg1); 376 sink.addPrefix(arg0, arg1); 377 } 378 379 public void endPrefixMapping(String arg0) throws SAXException { 380 } 381 382 public void startElement(String arg0, String localname, String qname, Attributes arg3) throws SAXException { 383 try { 384 //System.err.println("Start element: " + arg0 + " " + arg1 + " " + arg2); 385 386 // This is set very late in some html5 cases (not even ready by document start) 387 if (context == null) { 388 this.setBase(locator.getSystemId()); 389 } 390 391 // Dammit, not quite the same as XMLEventFactory 392 String prefix = /*(localname.equals(qname))*/ 393 (qname.indexOf(':') == -1 ) ? "" 394 : qname.substring(0, qname.indexOf(':')); 395 if (settings.contains(Setting.ManualNamespaces)) { 396 getNamespaces(arg3); 397 if (prefix.length() != 0) { 398 arg0 = context.getNamespaceURI(prefix); 399 localname = localname.substring(prefix.length() + 1); 400 } 401 } 402 StartElement e = eventFactory.createStartElement( 403 prefix, arg0, localname, 404 fromAttributes(arg3), null, context); 405 406 if (literalCollector.isCollecting()) literalCollector.handleEvent(e); 407 408 // If we are gathering XML we stop parsing 409 if (!literalCollector.isCollectingXML()) context = parse(context, e); 410 } catch (XMLStreamException ex) { 411 throw new RuntimeException("Streaming issue", ex); 412 } 413 414 } 415 416 public void endElement(String arg0, String localname, String qname) throws SAXException { 417 //System.err.println("End element: " + arg0 + " " + arg1 + " " + arg2); 418 if (literalCollector.isCollecting()) { 419 String prefix = (localname.equals(qname)) ? "" 420 : qname.substring(0, qname.indexOf(':')); 421 XMLEvent e = eventFactory.createEndElement(prefix, arg0, localname); 422 literalCollector.handleEvent(e); 423 } 424 // If we aren't collecting an XML literal keep parsing 425 if (!literalCollector.isCollectingXML()) context = context.parent; 426 } 427 428 public void characters(char[] arg0, int arg1, int arg2) throws SAXException { 429 if (literalCollector.isCollecting()) { 430 XMLEvent e = eventFactory.createCharacters(String.valueOf(arg0, arg1, arg2)); 431 literalCollector.handleEvent(e); 432 } 433 } 434 435 public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { 436 //System.err.println("Whitespace..."); 437 if (literalCollector.isCollecting()) { 438 XMLEvent e = eventFactory.createIgnorableSpace(String.valueOf(arg0, arg1, arg2)); 439 literalCollector.handleEvent(e); 440 } 441 } 442 443 public void processingInstruction(String arg0, String arg1) throws SAXException { 444 } 445 446 public void skippedEntity(String arg0) throws SAXException { 447 } 448 449 private Iterator fromAttributes(Attributes attributes) { 450 List toReturn = new LinkedList(); 451 452 for (int i = 0; i < attributes.getLength(); i++) { 453 String qname = attributes.getQName(i); 454 String prefix = qname.contains(":") ? qname.substring(0, qname.indexOf(":")) : ""; 455 Attribute attr = eventFactory.createAttribute( 456 prefix, attributes.getURI(i), 457 attributes.getLocalName(i), attributes.getValue(i)); 458 459 if (!qname.equals("xmlns") && !qname.startsWith("xmlns:")) 460 toReturn.add(attr); 461 } 462 463 return toReturn.iterator(); 464 } 465 466 // 1.1 method 467 468 private void parsePrefixes(String value, EvalContext context) { 469 String[] parts = value.split("\\s+"); 470 for (int i = 0; i < parts.length; i += 2) { 471 String prefix = parts[i]; 472 if (i + 1 < parts.length && prefix.endsWith(":")) { 473 String prefixFix = prefix.substring(0, prefix.length() - 1); 474 context.setPrefix(prefixFix, parts[i+1]); 475 sink.addPrefix(prefixFix, parts[i+1]); 476 } 477 } 478 } 479 } 480 481 /* 482 * (c) Copyright 2009 University of Bristol 483 * All rights reserved. 484 * 485 * Redistribution and use in source and binary forms, with or without 486 * modification, are permitted provided that the following conditions 487 * are met: 488 * 1. Redistributions of source code must retain the above copyright 489 * notice, this list of conditions and the following disclaimer. 490 * 2. Redistributions in binary form must reproduce the above copyright 491 * notice, this list of conditions and the following disclaimer in the 492 * documentation and/or other materials provided with the distribution. 493 * 3. The name of the author may not be used to endorse or promote products 494 * derived from this software without specific prior written permission. 495 * 496 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 497 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 498 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 499 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 500 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 501 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 502 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 503 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 504 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 505 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 506 */