Fix Capitalization of English Proper Nouns

sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
edited December 2018 in Knowledge Base
This is just a quick handy tool that I have used countless times - it fixes the text in a nominal attribute for English proper nouns.

For example, the name "SCOTT" or "scott" gets converted to "Scott". Not exciting, but handy in text mining.

Here's the process if you want to look at it. It's also on the Community Repository for easy access as a building block.

<?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
  <!-- Process context: no input locations, output locations or macros are defined. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <!-- Root operator: builds sample data containing a "name" attribute, then adds a
       capitalization-corrected copy of it and delivers the result on port "result 1". -->
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample data: two generated example sets (name = "SCOTT" and name = "genzer")
           are appended into one set and passed out through port "out 1". -->
      <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="82" name="Subprocess (2)" width="90" x="45" y="34">
        <process expanded="true">
          <!-- First sample value: an all upper case name. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="8.1.003" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
            <list key="attribute_values">
              <parameter key="name" value="&quot;SCOTT&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Second sample value: an all lower case name. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="8.1.003" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
            <list key="attribute_values">
              <parameter key="name" value="&quot;genzer&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Merge both generated sets into a single example set. -->
          <operator activated="true" class="append" compatibility="8.1.003" expanded="true" height="103" name="Append" width="90" x="179" y="34"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">sample data</description>
      </operator>
      <!-- Adds the attribute "properNoun": the first character of "name" in upper case,
           concatenated with the remaining length(name)-1 characters in lower case
           (e.g. "SCOTT" becomes "Scott").
           NOTE(review): only the first character of the whole value is capitalized, so a
           multi-word value such as "scott genzer" would presumably become "Scott genzer".
           NOTE(review): for an empty value length(name)-1 is negative; confirm how
           suffix() behaves in that case. -->
      <operator activated="true" class="generate_attributes" compatibility="8.1.003" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
        <list key="function_descriptions">
          <parameter key="properNoun" value="concat(upper(prefix(name,1)),lower(suffix(name,length(name)-1)))"/>
        </list>
        <description align="center" color="transparent" colored="false" width="126">fix capitalization of English nouns to proper nouns</description>
      </operator>
      <!-- Wiring: sample data -> Generate Attributes -> process result 1. -->
      <connect from_op="Subprocess (2)" from_port="out 1" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="252"/>
    </process>
  </operator>
</process>

And here's another one if your text is coming from a document:

<?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
  <!-- Process context: no input locations, output locations or macros are defined. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <!-- Root operator: creates a sample document, fixes the capitalization of its text
       inside a subprocess and delivers the resulting document on port "result 1". -->
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample input: a document whose text is "SCOTT".
           NOTE(review): breakpoints="after" presumably pauses execution after this
           operator so the sample document can be inspected; confirm it is intended
           before reusing this as a building block. -->
      <operator activated="true" breakpoints="after" class="text:create_document" compatibility="8.1.000" expanded="true" height="68" name="Create Document" width="90" x="45" y="34">
        <parameter key="text" value="SCOTT"/>
        <description align="center" color="transparent" colored="false" width="126">sample document</description>
      </operator>
      <!-- Capitalization fix for documents: document -> example set -> expression on the
           "text" attribute -> text type -> documents -> one combined document. -->
      <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="82" name="Subprocess (2)" width="90" x="179" y="34">
        <process expanded="true">
          <!-- Convert the incoming document to an example set; the document text goes into
               the attribute "text" and meta information is not added. -->
          <operator activated="true" class="text:documents_to_data" compatibility="8.1.000" expanded="true" height="82" name="Documents to Data" width="90" x="45" y="34">
            <parameter key="text_attribute" value="text"/>
            <parameter key="add_meta_information" value="false"/>
          </operator>
          <!-- Same expression as the nominal variant, but the result is written under the
               attribute name "text" itself: first character upper case, the remaining
               length(text)-1 characters lower case.
               NOTE(review): only the first character of the whole text is capitalized;
               confirm this is acceptable for multi-word documents. -->
          <operator activated="true" class="generate_attributes" compatibility="8.1.003" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
            <list key="function_descriptions">
              <parameter key="text" value="concat(upper(prefix(text,1)),lower(suffix(text,length(text)-1)))"/>
            </list>
          </operator>
          <!-- Convert only the single attribute "text" from nominal to text type;
               presumably required by Data to Documents below (verify). -->
          <operator activated="true" class="nominal_to_text" compatibility="8.1.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="text"/>
          </operator>
          <!-- Turn the example set back into documents; no weights are specified. -->
          <operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="34">
            <list key="specify_weights"/>
          </operator>
          <!-- Combine the resulting documents into the single document sent to "out 1". -->
          <operator activated="true" class="text:combine_documents" compatibility="8.1.000" expanded="true" height="82" name="Combine Documents" width="90" x="581" y="34"/>
          <connect from_port="in 1" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Combine Documents" to_port="documents 1"/>
          <connect from_op="Combine Documents" from_port="document" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">fix capitalization of English nouns to proper nouns</description>
      </operator>
      <!-- Wiring: Create Document -> Subprocess (2) -> process result 1. -->
      <connect from_op="Create Document" from_port="output" to_op="Subprocess (2)" to_port="in 1"/>
      <connect from_op="Subprocess (2)" from_port="out 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="252"/>
    </process>
  </operator>
</process>

Scott
Sign In or Register to comment.