opentag.com
\\ Technologies :: Formats :: SRX

The Segmentation Rules eXchange (SRX) format is an XML-based format for describing the rules used to segment text. The rules are based on regular expressions.

To try out SRX and test rules, you can use Ratel. It is a free cross-platform SRX editor written in Java.

Screen shot of Ratel:

Example of an SRX v1.0 file:

<?xml version="1.0"?>
<!-- Sample SRX 1.0 document: two named rule sets ("Default" and "Japanese")
     plus a map that selects a rule set from a language pattern. -->
<srx version="1.0">
 <!-- Header options.
      NOTE(review): segmentsubflows and the formathandle include flags
      presumably control sub-flow segmentation and which side of a break
      each kind of inline format code (start / end / isolated) falls on;
      confirm against the SRX 1.0 specification. -->
 <header segmentsubflows="yes">
  <formathandle type="start" include="no"/>
  <formathandle type="end" include="yes"/>
  <formathandle type="isolated" include="yes"/>
 </header>
 <body>
  <languagerules>
   <!-- Each rule pairs a pattern for the text before a candidate break
        position (beforebreak) with one for the text after it (afterbreak);
        the break attribute says whether that position is a break.
        NOTE(review): the break="no" exceptions are listed ahead of the
        general break="yes" rules, so rule order presumably matters
        (first match wins); confirm before reordering. -->
   <languagerule languagerulename="Default">
    <!-- Exception: a leading number followed by a period and whitespace,
         such as a list label "1. ", is not a break. -->
    <rule break="no">
     <beforebreak>^\s*[0-9]+\.</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- Exception: "etc." in any letter case, followed by whitespace and a
         lower-case ASCII letter, is not a break. -->
    <rule break="no">
     <beforebreak>[Ee][Tt][Cc]\.</beforebreak>
     <afterbreak>\s[a-z]</afterbreak>
    </rule>
    <!-- Exception: "Mr." preceded and followed by whitespace is not a break. -->
    <rule break="no">
     <beforebreak>\sMr\.</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- General rule: break after one or more of . ? ! when whitespace
         follows. -->
    <rule break="yes">
     <beforebreak>[\.\?!]+</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- General rule: break in front of a line feed. The empty beforebreak
         presumably places no condition on the preceding text. -->
    <rule break="yes">
     <beforebreak></beforebreak>
     <afterbreak>\n</afterbreak>
    </rule>
   </languagerule>
   <languagerule languagerulename="Japanese">
    <!-- Exception: leading number plus period and whitespace, identical to
         the first Default rule. -->
    <rule break="no">
     <beforebreak>^\s*[0-9]+\.</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- Exception: "etc." in any letter case. Unlike the Default rule set,
         afterbreak is empty, so presumably no break is made whatever text
         follows. -->
    <rule break="no">
     <beforebreak>[Ee][Tt][Cc]\.</beforebreak>
     <afterbreak></afterbreak>
    </rule>
    <!-- General rule: break after one or more of . ? ! when whitespace
         follows. -->
    <rule break="yes">
     <beforebreak>[\.\?!]+</beforebreak>
     <afterbreak>\s</afterbreak>
    </rule>
    <!-- General rule: break after a run of the listed characters; the empty
         afterbreak means no following whitespace is required.
         NOTE(review): the escapes look like they are meant as the code
         points U+FF61, U+3002, U+FF0E, U+FF1F and U+FF01 (halfwidth and
         fullwidth sentence punctuation). Engines that read \x as exactly
         two hex digits would instead see \xff followed by literal digits;
         confirm the escape syntax of the target engine (ICU and Java would
         write \uFF61 and so on). -->
    <rule break="yes">
     <beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
     <afterbreak></afterbreak>
    </rule>
    <!-- General rule: break in front of a line feed, as in Default. -->
    <rule break="yes">
     <beforebreak></beforebreak>
     <afterbreak>\n</afterbreak>
    </rule>
   </languagerule>
  </languagerules>
  <!-- Maps language patterns to the rule sets defined above: a pattern
       starting with "JA" selects "Japanese", and the catch-all ".*" selects
       "Default". The catch-all is listed last, so the entries are presumably
       tried in order; confirm.
       NOTE(review): "JA.*" is upper case; if matching is case-sensitive it
       would not match a code such as "ja-JP". Verify against the consumer. -->
  <maprules>
   <maprule maprulename="Default">
    <languagemap languagepattern="JA.*" languagerulename="Japanese"/>
    <languagemap languagepattern=".*" languagerulename="Default"/>
   </maprule>
  </maprules>
 </body>
</srx>

For more information on SRX see the GALA Web pages on this topic.