001 /*
002 * (c) Copyright 2009 University of Bristol
003 * All rights reserved.
004 * [See end of file]
005 */
006 package net.rootdev.javardfa;
007
008 import net.rootdev.javardfa.uri.URIExtractor10;
009 import net.rootdev.javardfa.uri.URIExtractor;
010 import net.rootdev.javardfa.uri.IRIResolver;
011 import net.rootdev.javardfa.literal.LiteralCollector;
012 import java.util.Collection;
013 import java.util.EnumSet;
014 import java.util.Iterator;
015 import java.util.LinkedList;
016 import java.util.List;
017 import java.util.Set;
018 import javax.xml.namespace.QName;
019 import javax.xml.stream.XMLEventFactory;
020 import javax.xml.stream.XMLOutputFactory;
021 import javax.xml.stream.XMLStreamException;
022 import javax.xml.stream.events.Attribute;
023 import javax.xml.stream.events.StartElement;
024 import javax.xml.stream.events.XMLEvent;
025 import org.xml.sax.Attributes;
026 import org.xml.sax.ContentHandler;
027 import org.xml.sax.Locator;
028 import org.xml.sax.SAXException;
029
030 /**
031 * @author Damian Steer <pldms@mac.com>
032 */
033 public class Parser implements ContentHandler {
034
/** Factory for the StAX events built from incoming SAX callbacks. */
private final XMLEventFactory eventFactory;
/** Receiver of every triple, prefix and base notification this parser produces. */
private final StatementSink sink;
/** Settings currently switched on; the same set instance is handed to the extractor. */
private final Set<Setting> settings;
/** Gathers literal content while a property value is being collected. */
private final LiteralCollector literalCollector;
/** Turns attribute values into URIs. */
private final URIExtractor extractor;
/** Consulted for profile attributes when {@code Setting.OnePointOne} is enabled. */
private final ProfileCollector profileCollector;

/**
 * Creates a parser reporting to {@code sink}, using newly created StAX
 * factories, a {@link URIExtractor10} backed by a new {@link IRIResolver},
 * and {@link ProfileCollector#EMPTY_COLLECTOR}.
 *
 * @param sink receiver of the parsed statements
 */
public Parser(StatementSink sink) {
    this(sink,
            XMLOutputFactory.newInstance(),
            XMLEventFactory.newInstance(),
            new URIExtractor10(new IRIResolver()),
            ProfileCollector.EMPTY_COLLECTOR);
}

/**
 * Creates a parser from explicitly supplied collaborators. No settings are
 * enabled initially.
 *
 * @param sink             receiver of the parsed statements
 * @param outputFactory    factory passed to the literal collector; this
 *                         constructor forces
 *                         {@link XMLOutputFactory#IS_REPAIRING_NAMESPACES} on it
 * @param eventFactory     factory used to build StAX events
 * @param extractor        URI extractor; it is given this parser's settings set
 * @param profileCollector collector used for profile attributes
 */
public Parser(StatementSink sink,
        XMLOutputFactory outputFactory,
        XMLEventFactory eventFactory,
        URIExtractor extractor,
        ProfileCollector profileCollector) {
    this.sink = sink;
    this.eventFactory = eventFactory;
    this.settings = EnumSet.noneOf(Setting.class);
    this.extractor = extractor;
    this.literalCollector = new LiteralCollector(this, eventFactory, outputFactory);
    this.profileCollector = profileCollector;

    // The extractor observes later enable()/disable() calls through the shared set.
    this.extractor.setSettings(this.settings);

    // Important, although I guess the caller doesn't get total control
    outputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, Boolean.TRUE);
}
067
068 public void enable(Setting setting) {
069 settings.add(setting);
070 }
071
072 public void disable(Setting setting) {
073 settings.remove(setting);
074 }
075
076 public void setBase(String base) {
077 this.context = new EvalContext(base);
078 sink.setBase(context.getBase());
079 }
080
081 EvalContext parse(EvalContext context, StartElement element)
082 throws XMLStreamException {
083 boolean skipElement = false;
084 String newSubject = null;
085 String currentObject = null;
086 List<String> forwardProperties = new LinkedList();
087 List<String> backwardProperties = new LinkedList();
088 String currentLanguage = context.language;
089
090 if (settings.contains(Setting.OnePointOne)) {
091
092 if (element.getAttributeByName(Constants.vocab) != null) {
093 context.vocab =
094 element.getAttributeByName(Constants.vocab).getValue().trim();
095 }
096
097 if (element.getAttributeByName(Constants.prefix) != null) {
098 parsePrefixes(element.getAttributeByName(Constants.prefix).getValue(), context);
099 }
100
101 if (element.getAttributeByName(Constants.profile) != null) {
102 String profileURI = extractor.resolveURI(
103 element.getAttributeByName(Constants.profile).getValue(),
104 context);
105 profileCollector.getProfile(
106 profileURI,
107 context);
108 }
109 }
110
111 // The xml / html namespace matching is a bit ropey. I wonder if the html 5
112 // parser has a setting for this?
113 if (settings.contains(Setting.ManualNamespaces)) {
114 if (element.getAttributeByName(Constants.xmllang) != null) {
115 currentLanguage = element.getAttributeByName(Constants.xmllang).getValue();
116 if (currentLanguage.length() == 0) currentLanguage = null;
117 } else if (element.getAttributeByName(Constants.lang) != null) {
118 currentLanguage = element.getAttributeByName(Constants.lang).getValue();
119 if (currentLanguage.length() == 0) currentLanguage = null;
120 }
121 } else if (element.getAttributeByName(Constants.xmllangNS) != null) {
122 currentLanguage = element.getAttributeByName(Constants.xmllangNS).getValue();
123 if (currentLanguage.length() == 0) currentLanguage = null;
124 }
125
126 if (Constants.base.equals(element.getName()) &&
127 element.getAttributeByName(Constants.href) != null) {
128 context.setBase(element.getAttributeByName(Constants.href).getValue());
129 sink.setBase(context.getBase());
130 }
131
132 if (element.getAttributeByName(Constants.rev) == null &&
133 element.getAttributeByName(Constants.rel) == null) {
134 Attribute nSubj = findAttribute(element, Constants.about, Constants.src,
135 Constants.resource, Constants.href);
136 if (nSubj != null) {
137 newSubject = extractor.getURI(element, nSubj, context);
138 }
139 if (newSubject == null) {
140 if (Constants.body.equals(element.getName()) ||
141 Constants.head.equals(element.getName())) {
142 newSubject = context.base;
143 }
144 else if (element.getAttributeByName(Constants.typeof) != null) {
145 newSubject = createBNode();
146 } else {
147 if (context.parentObject != null) {
148 newSubject = context.parentObject;
149 }
150 if (element.getAttributeByName(Constants.property) == null) {
151 skipElement = true;
152 }
153 }
154 }
155 } else {
156 Attribute nSubj = findAttribute(element, Constants.about, Constants.src);
157 if (nSubj != null) {
158 newSubject = extractor.getURI(element, nSubj, context);
159 }
160 if (newSubject == null) {
161 // if element is head or body assume about=""
162 if (Constants.head.equals(element.getName()) ||
163 Constants.body.equals(element.getName())) {
164 newSubject = context.base;
165 } else if (element.getAttributeByName(Constants.typeof) != null) {
166 newSubject = createBNode();
167 } else if (context.parentObject != null) {
168 newSubject = context.parentObject;
169 }
170 }
171 Attribute cObj = findAttribute(element, Constants.resource, Constants.href);
172 if (cObj != null) {
173 currentObject = extractor.getURI(element, cObj, context);
174 }
175 }
176
177 if (newSubject != null && element.getAttributeByName(Constants.typeof) != null) {
178 List<String> types = extractor.getURIs(element,
179 element.getAttributeByName(Constants.typeof), context);
180 for (String type : types) {
181 emitTriples(newSubject,
182 Constants.rdfType,
183 type);
184 }
185 }
186
187 // Dodgy extension
188 if (settings.contains(Setting.FormMode)) {
189 if (Constants.form.equals(element.getName())) {
190 emitTriples(newSubject, Constants.rdfType, "http://www.w3.org/1999/xhtml/vocab/#form"); // Signal entering form
191 }
192 if (Constants.input.equals(element.getName()) &&
193 element.getAttributeByName(Constants.name) != null) {
194 currentObject = "?" + element.getAttributeByName(Constants.name).getValue();
195 }
196
197 }
198
199 if (currentObject != null) {
200 if (element.getAttributeByName(Constants.rel) != null) {
201 emitTriples(newSubject,
202 extractor.getURIs(element,
203 element.getAttributeByName(Constants.rel), context),
204 currentObject);
205 }
206 if (element.getAttributeByName(Constants.rev) != null) {
207 emitTriples(currentObject,
208 extractor.getURIs(element, element.getAttributeByName(Constants.rev), context),
209 newSubject);
210 }
211 } else {
212 if (element.getAttributeByName(Constants.rel) != null) {
213 forwardProperties.addAll(extractor.getURIs(element,
214 element.getAttributeByName(Constants.rel), context));
215 }
216 if (element.getAttributeByName(Constants.rev) != null) {
217 backwardProperties.addAll(extractor.getURIs(element,
218 element.getAttributeByName(Constants.rev), context));
219 }
220 if (!forwardProperties.isEmpty() || !backwardProperties.isEmpty()) {
221 // if predicate present
222 currentObject = createBNode();
223 }
224 }
225
226 // Getting literal values. Complicated!
227 if (element.getAttributeByName(Constants.property) != null) {
228 List<String> props = extractor.getURIs(element,
229 element.getAttributeByName(Constants.property), context);
230 String dt = getDatatype(element);
231 if (element.getAttributeByName(Constants.content) != null) { // The easy bit
232 String lex = element.getAttributeByName(Constants.content).getValue();
233 if (dt == null || dt.length() == 0) {
234 emitTriplesPlainLiteral(newSubject, props, lex, currentLanguage);
235 } else {
236 emitTriplesDatatypeLiteral(newSubject, props, lex, dt);
237 }
238 } else {
239 literalCollector.collect(newSubject, props, dt, currentLanguage);
240 }
241 }
242
243 if (!skipElement && newSubject != null) {
244 emitTriples(context.parentSubject,
245 context.forwardProperties,
246 newSubject);
247
248 emitTriples(newSubject,
249 context.backwardProperties,
250 context.parentSubject);
251 }
252
253 EvalContext ec = new EvalContext(context);
254 if (skipElement) {
255 ec.language = currentLanguage;
256 } else {
257 if (newSubject != null) {
258 ec.parentSubject = newSubject;
259 } else {
260 ec.parentSubject = context.parentSubject;
261 }
262
263 if (currentObject != null) {
264 ec.parentObject = currentObject;
265 } else if (newSubject != null) {
266 ec.parentObject = newSubject;
267 } else {
268 ec.parentObject = context.parentSubject;
269 }
270
271 ec.language = currentLanguage;
272 ec.forwardProperties = forwardProperties;
273 ec.backwardProperties = backwardProperties;
274 }
275 return ec;
276 }
277
278 private Attribute findAttribute(StartElement element, QName... names) {
279 for (QName aName : names) {
280 Attribute a = element.getAttributeByName(aName);
281 if (a != null) {
282 return a;
283 }
284 }
285 return null;
286 }
287
288 public void emitTriples(String subj, Collection<String> props, String obj) {
289 for (String prop : props) {
290 sink.addObject(subj, prop, obj);
291 }
292 }
293
294 public void emitTriplesPlainLiteral(String subj, Collection<String> props, String lex, String language) {
295 for (String prop : props) {
296 sink.addLiteral(subj, prop, lex, language, null);
297 }
298 }
299
300 public void emitTriplesDatatypeLiteral(String subj, Collection<String> props, String lex, String datatype) {
301 for (String prop : props) {
302 sink.addLiteral(subj, prop, lex, null, datatype);
303 }
304 }
305
306 int bnodeId = 0;
307
308 private String createBNode() // TODO probably broken? Can you write bnodes in rdfa directly?
309 {
310 return "_:node" + (bnodeId++);
311 }
312
313 private String getDatatype(StartElement element) {
314 Attribute de = element.getAttributeByName(Constants.datatype);
315 if (de == null) {
316 return null;
317 }
318 String dt = de.getValue();
319 if (dt.length() == 0) {
320 return dt;
321 }
322 return extractor.expandCURIE(element, dt, context);
323 }
324
325 private void getNamespaces(Attributes attrs) {
326 for (int i = 0; i < attrs.getLength(); i++) {
327 String qname = attrs.getQName(i);
328 String prefix = getPrefix(qname);
329 if ("xmlns".equals(prefix)) {
330 String pre = getLocal(prefix, qname);
331 String uri = attrs.getValue(i);
332 if (!settings.contains(Setting.ManualNamespaces) && pre.contains("_"))
333 continue; // not permitted
334 context.setNamespaceURI(pre, uri);
335 sink.addPrefix(pre, uri);
336 }
337 }
338 }
339
340 private String getPrefix(String qname) {
341 if (!qname.contains(":")) {
342 return "";
343 }
344 return qname.substring(0, qname.indexOf(":"));
345 }
346
347 private String getLocal(String prefix, String qname) {
348 if (prefix.length() == 0) {
349 return qname;
350 }
351 return qname.substring(prefix.length() + 1);
352 }
353 /**
354 * SAX methods
355 */
356 private Locator locator;
357 private EvalContext context;
358
359 public void setDocumentLocator(Locator arg0) {
360 this.locator = arg0;
361 if (locator.getSystemId() != null)
362 this.setBase(arg0.getSystemId());
363 }
364
365 public void startDocument() throws SAXException {
366 sink.start();
367 }
368
369 public void endDocument() throws SAXException {
370 sink.end();
371 }
372
373 public void startPrefixMapping(String arg0, String arg1)
374 throws SAXException {
375 context.setNamespaceURI(arg0, arg1);
376 sink.addPrefix(arg0, arg1);
377 }
378
379 public void endPrefixMapping(String arg0) throws SAXException {
380 }
381
382 public void startElement(String arg0, String localname, String qname, Attributes arg3) throws SAXException {
383 try {
384 //System.err.println("Start element: " + arg0 + " " + arg1 + " " + arg2);
385
386 // This is set very late in some html5 cases (not even ready by document start)
387 if (context == null) {
388 this.setBase(locator.getSystemId());
389 }
390
391 // Dammit, not quite the same as XMLEventFactory
392 String prefix = /*(localname.equals(qname))*/
393 (qname.indexOf(':') == -1 ) ? ""
394 : qname.substring(0, qname.indexOf(':'));
395 if (settings.contains(Setting.ManualNamespaces)) {
396 getNamespaces(arg3);
397 if (prefix.length() != 0) {
398 arg0 = context.getNamespaceURI(prefix);
399 localname = localname.substring(prefix.length() + 1);
400 }
401 }
402 StartElement e = eventFactory.createStartElement(
403 prefix, arg0, localname,
404 fromAttributes(arg3), null, context);
405
406 if (literalCollector.isCollecting()) literalCollector.handleEvent(e);
407
408 // If we are gathering XML we stop parsing
409 if (!literalCollector.isCollectingXML()) context = parse(context, e);
410 } catch (XMLStreamException ex) {
411 throw new RuntimeException("Streaming issue", ex);
412 }
413
414 }
415
416 public void endElement(String arg0, String localname, String qname) throws SAXException {
417 //System.err.println("End element: " + arg0 + " " + arg1 + " " + arg2);
418 if (literalCollector.isCollecting()) {
419 String prefix = (localname.equals(qname)) ? ""
420 : qname.substring(0, qname.indexOf(':'));
421 XMLEvent e = eventFactory.createEndElement(prefix, arg0, localname);
422 literalCollector.handleEvent(e);
423 }
424 // If we aren't collecting an XML literal keep parsing
425 if (!literalCollector.isCollectingXML()) context = context.parent;
426 }
427
428 public void characters(char[] arg0, int arg1, int arg2) throws SAXException {
429 if (literalCollector.isCollecting()) {
430 XMLEvent e = eventFactory.createCharacters(String.valueOf(arg0, arg1, arg2));
431 literalCollector.handleEvent(e);
432 }
433 }
434
435 public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
436 //System.err.println("Whitespace...");
437 if (literalCollector.isCollecting()) {
438 XMLEvent e = eventFactory.createIgnorableSpace(String.valueOf(arg0, arg1, arg2));
439 literalCollector.handleEvent(e);
440 }
441 }
442
443 public void processingInstruction(String arg0, String arg1) throws SAXException {
444 }
445
446 public void skippedEntity(String arg0) throws SAXException {
447 }
448
449 private Iterator fromAttributes(Attributes attributes) {
450 List toReturn = new LinkedList();
451
452 for (int i = 0; i < attributes.getLength(); i++) {
453 String qname = attributes.getQName(i);
454 String prefix = qname.contains(":") ? qname.substring(0, qname.indexOf(":")) : "";
455 Attribute attr = eventFactory.createAttribute(
456 prefix, attributes.getURI(i),
457 attributes.getLocalName(i), attributes.getValue(i));
458
459 if (!qname.equals("xmlns") && !qname.startsWith("xmlns:"))
460 toReturn.add(attr);
461 }
462
463 return toReturn.iterator();
464 }
465
466 // 1.1 method
467
468 private void parsePrefixes(String value, EvalContext context) {
469 String[] parts = value.split("\\s+");
470 for (int i = 0; i < parts.length; i += 2) {
471 String prefix = parts[i];
472 if (i + 1 < parts.length && prefix.endsWith(":")) {
473 String prefixFix = prefix.substring(0, prefix.length() - 1);
474 context.setPrefix(prefixFix, parts[i+1]);
475 sink.addPrefix(prefixFix, parts[i+1]);
476 }
477 }
478 }
479 }
480
481 /*
482 * (c) Copyright 2009 University of Bristol
483 * All rights reserved.
484 *
485 * Redistribution and use in source and binary forms, with or without
486 * modification, are permitted provided that the following conditions
487 * are met:
488 * 1. Redistributions of source code must retain the above copyright
489 * notice, this list of conditions and the following disclaimer.
490 * 2. Redistributions in binary form must reproduce the above copyright
491 * notice, this list of conditions and the following disclaimer in the
492 * documentation and/or other materials provided with the distribution.
493 * 3. The name of the author may not be used to endorse or promote products
494 * derived from this software without specific prior written permission.
495 *
496 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
497 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
498 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
499 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
500 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
501 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
502 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
503 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
504 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
505 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
506 */