Previous Page Next Page

Recipe 9.2. Performing Set Operations on Node Sets Using Value Semantics

Problem

You need to find the union , intersection, set difference, or symmetrical set difference between elements in two node sets; however, in your problem, equality is not defined as node-set identity. In other words, equality is a function of a node's value.

Solution

XSLT 1.0

The need for this solution may arise when working with multiple documents. Consider two documents with the same DTD but content that may not contain duplicate element values. XSLT elements coming from distinct documents are distinct even if they contain elements with the same namespace, attribute, and text values. See Example 9-1 to Example 9-4.

Example 9-1. people1.xslt
<people>
  <person name="Brad York" age="38" sex="m" smoker="yes"/>
  <person name="Charles Xavier" age="32" sex="m" smoker="no"/>
  <person name="David Williams" age="33" sex="m" smoker="no"/>
</people>

Example 9-2. people2.xslt
<people>
  <person name="Al Zehtooney" age="33" sex="m" smoker="no"/>
  <person name="Brad York" age="38" sex="m" smoker="yes"/>
  <person name="Charles Xavier" age="32" sex="m" smoker="no"/>
</people>

Example 9-3. Failed attempt to use XSLT union to select unique people
<xsl:template match="/">
  <people>
    <xsl:copy-of select="//person | document('people2.xml')//person"/>
  </people>
</xsl:template>

Example 9-4. Output when run with people1.xml as input
<people>
   <person name="Brad York" age="38" sex="m" smoker="yes"/>
   <person name="Charles Xavier" age="32" sex="m" smoker="no"/>
   <person name="David Williams" age="33" sex="m" smoker="no"/>
   <person name="Al Zehtooney" age="33" sex="m" smoker="no"/>
   <person name="Brad York" age="38" sex="m" smoker="yes"/>
   <person name="Charles Xavier" age="32" sex="m" smoker="no"/>
</people>

Relying on node identity can also break down in single document cases when you want equality of nodes to be a function of their text or attribute values.

The following stylesheet provides a reusable implementation of union, intersection, and set difference based on value semantics. The idea is that a stylesheet importing this one will override the template whose mode="vset:element-equality". This allows the importing stylesheet to define whatever equality semantics make sense for the given input:

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
 xmlns:vset="http:/www.ora.com/XSLTCookbook/namespaces/vset">
 
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
   
<!-- The default implementation of element equality. Override in the importing 
stylesheet as necessary. -->
<xsl:template match="node( ) | @*" mode="vset:element-equality">
  <xsl:param name="other"/>
  <xsl:if test=". = $other">  
    <xsl:value-of select="true( )"/>
  </xsl:if>
</xsl:template>
   
<!-- The default set membership test uses element equality. You will rarely need to 
override this in the importing stylesheet. -->
<xsl:template match="node( ) | @*" mode="vset:member-of">
  <xsl:param name="elem"/>
  <xsl:variable name="member-of">
    <xsl:for-each select=".">
      <xsl:apply-templates select="." mode="vset:element-equality">
        <xsl:with-param name="other" select="$elem"/>
      </xsl:apply-templates>
    </xsl:for-each>
  </xsl:variable>
  <xsl:value-of select="string($member-of)"/>
</xsl:template>
   
<!-- Compute the union of two sets using "by value" equality. -->
<xsl:template name="vset:union">
  <xsl:param name="nodes1" select="/.." />
  <xsl:param name="nodes2" select="/.." />
  <!-- for internal use -->
  <xsl:param name="nodes" select="$nodes1 | $nodes2" />
  <xsl:param name="union" select="/.." />
  <xsl:choose>
    <xsl:when test="$nodes">
      <xsl:variable name="test">
        <xsl:apply-templates select="$union" mode="vset:member-of">
          <xsl:with-param name="elem" select="$nodes[1]" />
        </xsl:apply-templates>
      </xsl:variable>
      <xsl:call-template name="vset:union">
        <xsl:with-param name="nodes" select="$nodes[position( ) > 1]" />
        <xsl:with-param name="union" 
                        select="$union | $nodes[1][not(string($test))]" />
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <xsl:apply-templates select="$union" mode="vset:union" />
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
   
<!-- Return a copy of union by default. Override in importing stylesheet to receive
reults as a "callback"-->
<xsl:template match="/ | node( ) | @*" mode="vset:union">
  <xsl:copy-of select="."/>
</xsl:template>
   
<!-- Compute the intersection of two sets using "by value" equality. -->
<xsl:template name="vset:intersection">
  <xsl:param name="nodes1" select="/.."/>
  <xsl:param name="nodes2" select="/.."/>
  <!-- For internal use -->
  <xsl:param name="intersect" select="/.."/>
  
  <xsl:choose>
    <xsl:when test="not($nodes1)">
      <xsl:apply-templates select="$intersect" mode="vset:intersection"/>
    </xsl:when>
    <xsl:when test="not($nodes2)">
      <xsl:apply-templates select="$intersect" mode="vset:intersection"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:variable name="test1">
        <xsl:apply-templates select="$nodes2" mode="vset:member-of">
          <xsl:with-param name="elem" select="$nodes1[1]"/>
        </xsl:apply-templates>
      </xsl:variable>
      <xsl:variable name="test2">
        <xsl:apply-templates select="$intersect" mode="vset:member-of">
          <xsl:with-param name="elem" select="$nodes1[1]"/>
        </xsl:apply-templates>
      </xsl:variable>
      <xsl:choose>
        <xsl:when test="string($test1) and not(string($test2))">
          <xsl:call-template name="vset:intersection">
            <xsl:with-param name="nodes1" 
                    select="$nodes1[position( ) > 1]"/>
            <xsl:with-param name="nodes2" select="$nodes2"/>
            <xsl:with-param name="intersect" 
                    select="$intersect | $nodes1[1]"/>
          </xsl:call-template>
        </xsl:when>
        <xsl:otherwise>
          <xsl:call-template name="vset:intersection">
            <xsl:with-param name="nodes1" 
                    select="$nodes1[position( ) > 1]"/>
            <xsl:with-param name="nodes2" select="$nodes2"/>
            <xsl:with-param name="intersect" select="$intersect"/>
          </xsl:call-template>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
   
<!-- Return a copy of intersection by default. Override in importing stylesheet to
receive results as a "callback"-->
<xsl:template match="/ | node( ) | @*" mode="vset:intersection">
  <xsl:copy-of select="."/>
</xsl:template>
   
<!-- Compute the differnce between two sets (node1 - nodes2) using "by value" 
equality. -->
<xsl:template name="vset:difference">
  <xsl:param name="nodes1" select="/.."/>
  <xsl:param name="nodes2" select="/.."/>
  <!-- For internal use -->
  <xsl:param name="difference" select="/.."/>
  
  <xsl:choose>
    <xsl:when test="not($nodes1)">
      <xsl:apply-templates select="$difference" mode="vset:difference"/>
    </xsl:when>
    <xsl:when test="not($nodes2)">
      <xsl:apply-templates select="$nodes1" mode="vset:difference"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:variable name="test1">
        <xsl:apply-templates select="$nodes2" mode="vset:member-of">
          <xsl:with-param name="elem" select="$nodes1[1]"/>
        </xsl:apply-templates>
      </xsl:variable>
      <xsl:variable name="test2">
        <xsl:apply-templates select="$difference" mode="vset:member-of">
          <xsl:with-param name="elem" select="$nodes1[1]"/>
        </xsl:apply-templates>
      </xsl:variable>
      <xsl:choose>
        <xsl:when test="string($test1) or string($test2)">
          <xsl:call-template name="vset:difference">
            <xsl:with-param name="nodes1" 
                    select="$nodes1[position( ) > 1]"/>
            <xsl:with-param name="nodes2" select="$nodes2"/>
            <xsl:with-param name="difference" select="$difference"/>
          </xsl:call-template>
        </xsl:when>
        <xsl:otherwise>
          <xsl:call-template name="vset:difference">
            <xsl:with-param name="nodes1" 
                    select="$nodes1[position( ) > 1]"/>
            <xsl:with-param name="nodes2" select="$nodes2"/>
            <xsl:with-param name="difference" 
                    select="$difference | $nodes1[1]"/>
          </xsl:call-template>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
   
<!-- Return a copy of difference by default. Override in importing stylesheet to
receive results as a "callback"-->
<xsl:template match="/ | node( ) | @*" mode="vset:difference">
  <xsl:copy-of select="."/>
</xsl:template>

These recursive templates are implemented in terms of the following definitions:


Union(nodes1,nodes2)

The union includes everything in nodes2 plus everything in nodes1 not already a member of nodes2.


Intersection(nodes1,nodes2)

The intersection includes everything in nodes1 that is also a member of nodes2.


Difference(nodes1,nodes2)

The difference includes everything in nodes1 that is not also a member of nodes2.

In all cases, membership defaults to equality of string values, but the importing stylesheet can override this default.

Given these value-oriented set operations, you can achieve the desired effect on people1.xml and people2.xml using the following stylesheet:

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
 xmlns:vset="http:/www.ora.com/XSLTCookbook/namespaces/vset">
   
<xsl:import href="set.ops.xslt"/>
   
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
   
<xsl:template match="/">
  <people>
    <xsl:call-template name="vset:union">
      <xsl:with-param name="nodes1" select="//person"/>
      <xsl:with-param name="nodes2" select="document('people2.xml')//person"/>
    </xsl:call-template>
  </people>
</xsl:template>
   
<!--Define person equality as having the same name -->
<xsl:template match="person" mode="vset:element-equality">
  <xsl:param name="other"/>
  <xsl:if test="@name = $other/@name">  
    <xsl:value-of select="true( )"/>
  </xsl:if>
</xsl:template>
   
</xsl:stylesheet>

XSLT 2.0

The main enhancement of XSLT 2.0 is to take advantage of the availability of first-class functions and sequences. This eliminates the need for recursion, the call-back trick, and leads to cleaner definitions and usage. The functions vset:element-equality and vset:member-of can still be overridden in importing stylesheets to customize behavior.

<xsl:stylesheet version="2.0" 
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
xmlns:xs="http://www.w3.org/2001/XMLSchema" 
xmlns:vset="http:/www.ora.com/XSLTCookbook/namespaces/vset">

<!-- The default implementation of element equality. Override in the importing 
stylesheet as necessary. -->
<xsl:function name="vset:element-equality" as="xs:boolean">
  <xsl:param name="item1" as="item( )?"/>
  <xsl:param name="item2" as="item( )?"/>
  <xsl:sequence select="$item1 = $item2"/>
</xsl:function>
   
<!-- The default set membership test uses element equality. You will rarely need to 
override this in the importing stylesheet. -->
<xsl:function name="vset:member-of" as="xs:boolean">
  <xsl:param name="set" as="item( )*"/>
  <xsl:param name="elem" as="item( )"/>
  <xsl:variable name="member-of" as="xs:boolean*" 
                       select="for $test in $set 
                                    return if (vset:element-equality($test, $elem)) 
                                               then true( ) else ( )"/>
  <xsl:sequence select="not(empty($member-of))"/>
</xsl:function>
   
<!-- Compute the union of two sets using "by value" equality. -->
<xsl:function name="vset:union" as="item( )*">
  <xsl:param name="nodes1" as="item( )*"  />
  <xsl:param name="nodes2" as="item( )*" />
 <xsl:sequence select="$nodes1, for $test in $nodes2 
                                return if (vset:member-of($nodes1,$test)) 
                                       then ( ) else $test"/>  
</xsl:function>
   
   
<!-- Compute the intersection of two sets using "by value" equality. -->
<xsl:function name="vset:intersection" as="item( )*">
  <xsl:param name="nodes1" as="item( )*" />
  <xsl:param name="nodes2" as="item( )*" />
 <xsl:sequence select="for $test in $nodes1 
                       return if (vset:member-of($nodes2,$test)) 
                              then $test else ( )"/>  
</xsl:function>
   
<!-- Compute the difference between two sets (node1 - nodes2) using "by value" 
equality. -->

<xsl:function name="vset:difference" as="item( )*">
  <xsl:param name="nodes1" as="item( )*" />
  <xsl:param name="nodes2" as="item( )*" />
 <xsl:sequence select="for $test in $nodes1 return if (vset:member-of($nodes2,
 $test)) then ( ) else $test"/>  
</xsl:function>


</xsl:stylesheet>

Discussion

You might think that equality is a cut-and-dried issue; two things are either equal or they're not. However, in programming (as in politics), equality is in the eye of the beholder. In a typical document, an element is associated with a uniquely identifiable object. For example, a paragraph element, <p>...</p>, is distinct from another paragraph element somewhere else in the document, even if they have the same content. Hence, set operations based on the unique identity of elements are the norm. However, when considering XSLT operations crossing multiple documents or acting on elements that result from applying xsl:copy, we need to carefully consider what we want equality to be.

Here are some query examples in which value set semantics are required:

  1. You have two documents from different namespaces. Example 9-5 to Example 9-8 help you find all the element (local) names these documents have in common and those that are unique to each namespace.

    Example 9-5. doc1.xml
    <doc xmlns:doc1="doc1" xmlns="doc1">
      <chapter label="1">
        <section label="1">
          <p>
            Once upon a time...
          </p>
        </section>
      </chapter>
      <chapter label="2">
        <note to="editor">I am still waiting for my $100000 advance.</note>
        <section label="1">
          <p>
            ... and they lived happily ever after.
          </p>
        </section>
      </chapter>
    </doc>

    Example 9-6. doc2.xml
    <doc xmlns:doc1="doc2" xmlns="doc2">
      <chapter label="1">
        <section label="1">
          <sub>
            <p>
              Once upon a time...
              <ref type="footnote" label="1"/>
            </p>
          </sub>
          <fig>Figure1</fig>
        </section>
        <footnote label="1">
          Hey diddle diddle.
        </footnote>
      </chapter>
      <chapter label="2">
        <section label="1">
          <p>
            ... and they lived happily ever after.
          </p>
        </section>
      </chapter>
    </doc>

    Example 9-7. unique-element-names.xslt
    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
      xmlns:doc1="doc1" xmlns:doc2="doc2"
      xmlns:vset="http:/www.ora.com/XSLTCookbook/namespaces/vset"
      extension-element-prefixes="vset">
       
      <xsl:import href="set.ops.xslt"/> 
       
      <xsl:output method="text" />
      
      <xsl:template match="/">
        <xsl:text>&#xa;The elements in common are: </xsl:text>
        <xsl:call-template name="vset:intersection">
          <xsl:with-param name="nodes1" select="//*"/>
          <xsl:with-param name="nodes2" select="document('doc2.xml')//*"/>
        </xsl:call-template>  
       
        <xsl:text>&#xa;The elements only in doc1 are: </xsl:text>
        <xsl:call-template name="vset:difference">
          <xsl:with-param name="nodes1" select="//*"/>
          <xsl:with-param name="nodes2" select="document('doc2.xml')//*"/>
        </xsl:call-template>  
       
        <xsl:text>&#xa;The elements only in doc2 are: </xsl:text>
        <xsl:call-template name="vset:difference">
          <xsl:with-param name="nodes1" select="document('doc2.xml')//*"/>
          <xsl:with-param name="nodes2" select="//*"/>
        </xsl:call-template>  
        <xsl:text>&#xa;</xsl:text>
       
      </xsl:template>
      
      <xsl:template match="*" mode="vset:intersection">
         <xsl:value-of select="local-name(.)"/>
         <xsl:if test="position( ) != last( )">
           <xsl:text>, </xsl:text>
         </xsl:if>
      </xsl:template>
       
      <xsl:template match="*" mode="vset:difference">
         <xsl:value-of select="local-name(.)"/>
         <xsl:if test="position( ) != last( )">
           <xsl:text>, </xsl:text>
         </xsl:if>
      </xsl:template>
               
      <xsl:template match="doc1:* | doc2:*" mode="vset:element-equality">
       <xsl:param name="other"/>
        <xsl:if test="local-name(.) = local-name($other)">  
          <xsl:value-of select="true( )"/>
        </xsl:if>
      </xsl:template>
          
    </xsl:stylesheet>

    Example 9-8. Output
    The elements in common are: doc, chapter, section, p
    The elements only in doc1 are: note
    The elements only in doc2 are: sub, ref, fig, footnote

  2. A Visio XML document consists of master shapes, master-shape instances, and user-defined shapes with no corresponding master. You would like to extract the data for all unique shapes. For purpose of this query, two shapes are equal if either of the following are true:

    1. They both have master attributes, @Master, and these attribute values are equal.

    2. At least one lacks a master attribute, but their geometry elements, Geom, are equal. Geometry elements are equal if all attributes of all descendants of Geom are equal.

    Otherwise, they are not equal.

    This query can be implemented by taking the intersection of the set of all shapes with itself under the rules of equality stated earlier.[1]

    [1] A mathematician will tell you that the intersection of a set with itself will always yield the same set. This is true for proper sets (with no duplicates). However, here you are using an application-specific notion of equality, and the node sets typically will not be proper sets under that equality test. However, the value-set operations always produce proper sets, so this technique is a way of removing duplicates.

    You can also use the vset:union template with the nodes parameter:

    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
      xmlns:vxd="urn:schemas-microsoft-com:office:visio" 
      xmlns:vset="http:/www.ora.com/XSLTCookbook/namespaces/vset" 
      extension-element-prefixes="vset">
       
      <xsl:import href="set.ops.xslt"/> 
       
      <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
         
      <xsl:template match="/">
      <UniqueShapes>
          <xsl:call-template name="vset:intersection">
            <xsl:with-param name="nodes1" select="//vxd:Pages/*/*/vxd:Shape"/>
            <xsl:with-param name="nodes2" select="//vxd:Pages/*/*/vxd:Shape"/>
          </xsl:call-template> 
        </UniqueShapes>
      </xsl:template>
      
      <xsl:template match="vxd:Shape" mode="vset:intersection">
        <xsl:copy-of select="." />
      </xsl:template>
       
    <xsl:template match="vxd:Shape" mode="vset:element-equality">
      <xsl:param name="other"/>
      <xsl:choose>
        <xsl:when test="@Master and $other/@Master and @Master = $other/@Master">
          <xsl:value-of select="true( )"/>
        </xsl:when>
        <xsl:when test="not(@Master) or not($other/@Master)">
          <xsl:variable name="geom1">
            <xsl:for-each select="vxd:Geom//*/@*">
              <xsl:sort select="name( )"/>
              <xsl:value-of select="."/>
            </xsl:for-each>
         </xsl:variable> 
          <xsl:variable name="geom2">
            <xsl:for-each select="$other/vxd:Geom//*/@*">
              <xsl:sort select="name( )"/>
              <xsl:value-of select="."/>
            </xsl:for-each>
         </xsl:variable> 
          <xsl:if test="$geom1 = $geom2">
            <xsl:value-of select="true( )"/>
          </xsl:if>
        </xsl:when>
      </xsl:choose>
    </xsl:template>
     
     
    </xsl:stylesheet>


Previous Page Next Page