001    /*
002     * (c) Copyright 2009 University of Bristol
003     * All rights reserved.
004     * [See end of file]
005     */
006    package net.rootdev.javardfa;
007    
008    import net.rootdev.javardfa.uri.URIExtractor10;
009    import net.rootdev.javardfa.uri.URIExtractor;
010    import net.rootdev.javardfa.uri.IRIResolver;
011    import net.rootdev.javardfa.literal.LiteralCollector;
012    import java.util.Collection;
013    import java.util.EnumSet;
014    import java.util.Iterator;
015    import java.util.LinkedList;
016    import java.util.List;
017    import java.util.Set;
018    import javax.xml.namespace.QName;
019    import javax.xml.stream.XMLEventFactory;
020    import javax.xml.stream.XMLOutputFactory;
021    import javax.xml.stream.XMLStreamException;
022    import javax.xml.stream.events.Attribute;
023    import javax.xml.stream.events.StartElement;
024    import javax.xml.stream.events.XMLEvent;
025    import org.xml.sax.Attributes;
026    import org.xml.sax.ContentHandler;
027    import org.xml.sax.Locator;
028    import org.xml.sax.SAXException;
029    
030    /**
031     * @author Damian Steer <pldms@mac.com>
032     */
033    public class Parser implements ContentHandler {
034    
035        private final XMLEventFactory eventFactory;
036        private final StatementSink sink;
037        private final Set<Setting> settings;
038        private final LiteralCollector literalCollector;
039        private final URIExtractor extractor;
040        private final ProfileCollector profileCollector;
041    
042        public Parser(StatementSink sink) {
043            this(   sink,
044                    XMLOutputFactory.newInstance(),
045                    XMLEventFactory.newInstance(),
046                    new URIExtractor10(new IRIResolver()),
047                    ProfileCollector.EMPTY_COLLECTOR);
048        }
049    
050        public Parser(StatementSink sink,
051                XMLOutputFactory outputFactory,
052                XMLEventFactory eventFactory,
053                URIExtractor extractor,
054                ProfileCollector profileCollector) {
055            this.sink = sink;
056            this.eventFactory = eventFactory;
057            this.settings = EnumSet.noneOf(Setting.class);
058            this.extractor = extractor;
059            this.literalCollector = new LiteralCollector(this, eventFactory, outputFactory);
060            this.profileCollector = profileCollector;
061    
062            extractor.setSettings(settings);
063    
064            // Important, although I guess the caller doesn't get total control
065            outputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true);
066        }
067    
068        public void enable(Setting setting) {
069            settings.add(setting);
070        }
071    
072        public void disable(Setting setting) {
073            settings.remove(setting);
074        }
075    
076        public void setBase(String base) {
077            this.context = new EvalContext(base);
078            sink.setBase(context.getBase());
079        }
080    
081        EvalContext parse(EvalContext context, StartElement element)
082                throws XMLStreamException {
083            boolean skipElement = false;
084            String newSubject = null;
085            String currentObject = null;
086            List<String> forwardProperties = new LinkedList();
087            List<String> backwardProperties = new LinkedList();
088            String currentLanguage = context.language;
089    
090            if (settings.contains(Setting.OnePointOne)) {
091    
092                if (element.getAttributeByName(Constants.vocab) != null) {
093                    context.vocab =
094                        element.getAttributeByName(Constants.vocab).getValue().trim();
095                }
096    
097                if (element.getAttributeByName(Constants.prefix) != null) {
098                    parsePrefixes(element.getAttributeByName(Constants.prefix).getValue(), context);
099                }
100    
101                if (element.getAttributeByName(Constants.profile) != null) {
102                    String profileURI = extractor.resolveURI(
103                            element.getAttributeByName(Constants.profile).getValue(),
104                            context);
105                    profileCollector.getProfile(
106                            profileURI,
107                            context);
108                }
109            }
110    
111            // The xml / html namespace matching is a bit ropey. I wonder if the html 5
112            // parser has a setting for this?
113            if (settings.contains(Setting.ManualNamespaces)) {
114                if (element.getAttributeByName(Constants.xmllang) != null) {
115                    currentLanguage = element.getAttributeByName(Constants.xmllang).getValue();
116                    if (currentLanguage.length() == 0) currentLanguage = null;
117                } else if (element.getAttributeByName(Constants.lang) != null) {
118                    currentLanguage = element.getAttributeByName(Constants.lang).getValue();
119                    if (currentLanguage.length() == 0) currentLanguage = null;
120                }
121            } else if (element.getAttributeByName(Constants.xmllangNS) != null) {
122                currentLanguage = element.getAttributeByName(Constants.xmllangNS).getValue();
123                if (currentLanguage.length() == 0) currentLanguage = null;
124            }
125    
126            if (Constants.base.equals(element.getName()) &&
127                    element.getAttributeByName(Constants.href) != null) {
128                context.setBase(element.getAttributeByName(Constants.href).getValue());
129                sink.setBase(context.getBase());
130            }
131    
132            if (element.getAttributeByName(Constants.rev) == null &&
133                    element.getAttributeByName(Constants.rel) == null) {
134                Attribute nSubj = findAttribute(element, Constants.about, Constants.src,
135                        Constants.resource, Constants.href);
136                if (nSubj != null) {
137                    newSubject = extractor.getURI(element, nSubj, context);
138                }
139                if (newSubject == null) {
140                    if (Constants.body.equals(element.getName()) ||
141                                Constants.head.equals(element.getName())) {
142                        newSubject = context.base;
143                    }
144                    else if (element.getAttributeByName(Constants.typeof) != null) {
145                        newSubject = createBNode();
146                    } else {
147                        if (context.parentObject != null) {
148                            newSubject = context.parentObject;
149                        }
150                        if (element.getAttributeByName(Constants.property) == null) {
151                            skipElement = true;
152                        }
153                    }
154                }
155            } else {
156                Attribute nSubj = findAttribute(element, Constants.about, Constants.src);
157                if (nSubj != null) {
158                    newSubject = extractor.getURI(element, nSubj, context);
159                }
160                if (newSubject == null) {
161                    // if element is head or body assume about=""
162                    if (Constants.head.equals(element.getName()) ||
163                            Constants.body.equals(element.getName())) {
164                        newSubject = context.base;
165                    } else if (element.getAttributeByName(Constants.typeof) != null) {
166                        newSubject = createBNode();
167                    } else if (context.parentObject != null) {
168                        newSubject = context.parentObject;
169                    }
170                }
171                Attribute cObj = findAttribute(element, Constants.resource, Constants.href);
172                if (cObj != null) {
173                    currentObject = extractor.getURI(element, cObj, context);
174                }
175            }
176    
177            if (newSubject != null && element.getAttributeByName(Constants.typeof) != null) {
178                List<String> types = extractor.getURIs(element,
179                        element.getAttributeByName(Constants.typeof), context);
180                for (String type : types) {
181                    emitTriples(newSubject,
182                            Constants.rdfType,
183                            type);
184                }
185            }
186    
187            // Dodgy extension
188            if (settings.contains(Setting.FormMode)) {
189                if (Constants.form.equals(element.getName())) {
190                    emitTriples(newSubject, Constants.rdfType, "http://www.w3.org/1999/xhtml/vocab/#form"); // Signal entering form
191                }
192                if (Constants.input.equals(element.getName()) &&
193                        element.getAttributeByName(Constants.name) != null) {
194                    currentObject = "?" + element.getAttributeByName(Constants.name).getValue();
195                }
196    
197            }
198    
199            if (currentObject != null) {
200                if (element.getAttributeByName(Constants.rel) != null) {
201                    emitTriples(newSubject,
202                            extractor.getURIs(element,
203                                element.getAttributeByName(Constants.rel), context),
204                            currentObject);
205                }
206                if (element.getAttributeByName(Constants.rev) != null) {
207                    emitTriples(currentObject,
208                            extractor.getURIs(element, element.getAttributeByName(Constants.rev), context),
209                            newSubject);
210                }
211            } else {
212                if (element.getAttributeByName(Constants.rel) != null) {
213                    forwardProperties.addAll(extractor.getURIs(element,
214                            element.getAttributeByName(Constants.rel), context));
215                }
216                if (element.getAttributeByName(Constants.rev) != null) {
217                    backwardProperties.addAll(extractor.getURIs(element,
218                            element.getAttributeByName(Constants.rev), context));
219                }
220                if (!forwardProperties.isEmpty() || !backwardProperties.isEmpty()) {
221                    // if predicate present
222                    currentObject = createBNode();
223                }
224            }
225    
226            // Getting literal values. Complicated!
227            if (element.getAttributeByName(Constants.property) != null) {
228                List<String> props = extractor.getURIs(element,
229                        element.getAttributeByName(Constants.property), context);
230                String dt = getDatatype(element);
231                if (element.getAttributeByName(Constants.content) != null) { // The easy bit
232                    String lex = element.getAttributeByName(Constants.content).getValue();
233                    if (dt == null || dt.length() == 0) {
234                        emitTriplesPlainLiteral(newSubject, props, lex, currentLanguage);
235                    } else {
236                        emitTriplesDatatypeLiteral(newSubject, props, lex, dt);
237                    }
238                } else {
239                    literalCollector.collect(newSubject, props, dt, currentLanguage);
240                }
241            }
242    
243            if (!skipElement && newSubject != null) {
244                emitTriples(context.parentSubject,
245                        context.forwardProperties,
246                        newSubject);
247    
248                emitTriples(newSubject,
249                        context.backwardProperties,
250                        context.parentSubject);
251            }
252    
253            EvalContext ec = new EvalContext(context);
254            if (skipElement) {
255                ec.language = currentLanguage;
256            } else {
257                if (newSubject != null) {
258                    ec.parentSubject = newSubject;
259                } else {
260                    ec.parentSubject = context.parentSubject;
261                }
262    
263                if (currentObject != null) {
264                    ec.parentObject = currentObject;
265                } else if (newSubject != null) {
266                    ec.parentObject = newSubject;
267                } else {
268                    ec.parentObject = context.parentSubject;
269                }
270    
271                ec.language = currentLanguage;
272                ec.forwardProperties = forwardProperties;
273                ec.backwardProperties = backwardProperties;
274            }
275            return ec;
276        }
277    
278        private Attribute findAttribute(StartElement element, QName... names) {
279            for (QName aName : names) {
280                Attribute a = element.getAttributeByName(aName);
281                if (a != null) {
282                    return a;
283                }
284            }
285            return null;
286        }
287    
288        public void emitTriples(String subj, Collection<String> props, String obj) {
289            for (String prop : props) {
290                sink.addObject(subj, prop, obj);
291            }
292        }
293    
294        public void emitTriplesPlainLiteral(String subj, Collection<String> props, String lex, String language) {
295            for (String prop : props) {
296                sink.addLiteral(subj, prop, lex, language, null);
297            }
298        }
299    
300        public void emitTriplesDatatypeLiteral(String subj, Collection<String> props, String lex, String datatype) {
301            for (String prop : props) {
302                sink.addLiteral(subj, prop, lex, null, datatype);
303            }
304        }
305    
306        int bnodeId = 0;
307        
308        private String createBNode() // TODO probably broken? Can you write bnodes in rdfa directly?
309        {
310            return "_:node" + (bnodeId++);
311        }
312    
313        private String getDatatype(StartElement element) {
314            Attribute de = element.getAttributeByName(Constants.datatype);
315            if (de == null) {
316                return null;
317            }
318            String dt = de.getValue();
319            if (dt.length() == 0) {
320                return dt;
321            }
322            return extractor.expandCURIE(element, dt, context);
323        }
324    
325        private void getNamespaces(Attributes attrs) {
326            for (int i = 0; i < attrs.getLength(); i++) {
327                String qname = attrs.getQName(i);
328                String prefix = getPrefix(qname);
329                if ("xmlns".equals(prefix)) {
330                    String pre = getLocal(prefix, qname);
331                    String uri = attrs.getValue(i);
332                    if (!settings.contains(Setting.ManualNamespaces) && pre.contains("_"))
333                        continue; // not permitted
334                    context.setNamespaceURI(pre, uri);
335                    sink.addPrefix(pre, uri);
336                }
337            }
338        }
339    
340        private String getPrefix(String qname) {
341            if (!qname.contains(":")) {
342                return "";
343            }
344            return qname.substring(0, qname.indexOf(":"));
345        }
346    
347        private String getLocal(String prefix, String qname) {
348            if (prefix.length() == 0) {
349                return qname;
350            }
351            return qname.substring(prefix.length() + 1);
352        }
353        /**
354         * SAX methods
355         */
356        private Locator locator;
357        private EvalContext context;
358    
359        public void setDocumentLocator(Locator arg0) {
360            this.locator = arg0;
361            if (locator.getSystemId() != null)
362                this.setBase(arg0.getSystemId());
363        }
364    
365        public void startDocument() throws SAXException {
366            sink.start();
367        }
368    
369        public void endDocument() throws SAXException {
370            sink.end();
371        }
372    
373        public void startPrefixMapping(String arg0, String arg1)
374                throws SAXException {
375            context.setNamespaceURI(arg0, arg1);
376            sink.addPrefix(arg0, arg1);
377        }
378    
379        public void endPrefixMapping(String arg0) throws SAXException {
380        }
381    
382        public void startElement(String arg0, String localname, String qname, Attributes arg3) throws SAXException {
383            try {
384                //System.err.println("Start element: " + arg0 + " " + arg1 + " " + arg2);
385    
386                // This is set very late in some html5 cases (not even ready by document start)
387                if (context == null) {
388                    this.setBase(locator.getSystemId());
389                }
390    
391                // Dammit, not quite the same as XMLEventFactory
392                String prefix = /*(localname.equals(qname))*/
393                        (qname.indexOf(':') == -1 ) ? ""
394                        : qname.substring(0, qname.indexOf(':'));
395                if (settings.contains(Setting.ManualNamespaces)) {
396                    getNamespaces(arg3);
397                    if (prefix.length() != 0) {
398                        arg0 = context.getNamespaceURI(prefix);
399                        localname = localname.substring(prefix.length() + 1);
400                    }
401                }
402                StartElement e = eventFactory.createStartElement(
403                        prefix, arg0, localname,
404                        fromAttributes(arg3), null, context);
405    
406                if (literalCollector.isCollecting()) literalCollector.handleEvent(e);
407    
408                // If we are gathering XML we stop parsing
409                if (!literalCollector.isCollectingXML()) context = parse(context, e);
410            } catch (XMLStreamException ex) {
411                throw new RuntimeException("Streaming issue", ex);
412            }
413    
414        }
415    
416        public void endElement(String arg0, String localname, String qname) throws SAXException {
417            //System.err.println("End element: " + arg0 + " " + arg1 + " " + arg2);
418            if (literalCollector.isCollecting()) {
419                String prefix = (localname.equals(qname)) ? ""
420                        : qname.substring(0, qname.indexOf(':'));
421                XMLEvent e = eventFactory.createEndElement(prefix, arg0, localname);
422                literalCollector.handleEvent(e);
423            }
424            // If we aren't collecting an XML literal keep parsing
425            if (!literalCollector.isCollectingXML()) context = context.parent;
426        }
427    
428        public void characters(char[] arg0, int arg1, int arg2) throws SAXException {
429            if (literalCollector.isCollecting()) {
430                XMLEvent e = eventFactory.createCharacters(String.valueOf(arg0, arg1, arg2));
431                literalCollector.handleEvent(e);
432            }
433        }
434    
435        public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
436            //System.err.println("Whitespace...");
437            if (literalCollector.isCollecting()) {
438                XMLEvent e = eventFactory.createIgnorableSpace(String.valueOf(arg0, arg1, arg2));
439                literalCollector.handleEvent(e);
440            }
441        }
442    
443        public void processingInstruction(String arg0, String arg1) throws SAXException {
444        }
445    
446        public void skippedEntity(String arg0) throws SAXException {
447        }
448    
449        private Iterator fromAttributes(Attributes attributes) {
450            List toReturn = new LinkedList();
451            
452            for (int i = 0; i < attributes.getLength(); i++) {
453                String qname = attributes.getQName(i);
454                String prefix = qname.contains(":") ? qname.substring(0, qname.indexOf(":")) : "";
455                Attribute attr = eventFactory.createAttribute(
456                        prefix, attributes.getURI(i),
457                        attributes.getLocalName(i), attributes.getValue(i));
458    
459                if (!qname.equals("xmlns") && !qname.startsWith("xmlns:"))
460                    toReturn.add(attr);
461            }
462            
463            return toReturn.iterator();
464        }
465    
466        // 1.1 method
467    
468        private void parsePrefixes(String value, EvalContext context) {
469            String[] parts = value.split("\\s+");
470            for (int i = 0; i < parts.length; i += 2) {
471                String prefix = parts[i];
472                if (i + 1 < parts.length && prefix.endsWith(":")) {
473                    String prefixFix = prefix.substring(0, prefix.length() - 1);
474                    context.setPrefix(prefixFix, parts[i+1]);
475                    sink.addPrefix(prefixFix, parts[i+1]);
476                }
477            }
478        }
479    }
480    
481    /*
482     * (c) Copyright 2009 University of Bristol
483     * All rights reserved.
484     *
485     * Redistribution and use in source and binary forms, with or without
486     * modification, are permitted provided that the following conditions
487     * are met:
488     * 1. Redistributions of source code must retain the above copyright
489     *    notice, this list of conditions and the following disclaimer.
490     * 2. Redistributions in binary form must reproduce the above copyright
491     *    notice, this list of conditions and the following disclaimer in the
492     *    documentation and/or other materials provided with the distribution.
493     * 3. The name of the author may not be used to endorse or promote products
494     *    derived from this software without specific prior written permission.
495     *
496     * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
497     * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
498     * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
499     * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
500     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
501     * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
502     * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
503     * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
504     * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
505     * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
506     */