将复数名词转换为单数
Convert a plural noun to singular
这是使用 SimpleNLG 完成的 Java API
我想将 "elves" 转换为 "elf"。下面的代码是单数转复数,怎么修改才能复数转单数?
// Question snippet: realises the plural form of a noun with SimpleNLG.
// Create the lexicon with its no-argument constructor.
final XMLLexicon xmlLexicon = new XMLLexicon();
// Look up the noun entry for the string "elves".
final WordElement word = xmlLexicon.getWord("elves", LexicalCategory.NOUN);
// Wrap the lexicon entry so that inflection settings can be attached to it.
final InflectedWordElement pluralWord = new InflectedWordElement(word);
// Request the plural form.
pluralWord.setPlural(true);
// Realise the inflected word using the same lexicon and print the result.
final Realiser realiser = new Realiser(xmlLexicon);
System.out.println(realiser.realise(pluralWord));
这个 API 中显然没有 setSingular()
方法(我真的很依赖那个方法,我觉得这有点有趣,没有一个方法可以处理这样的事情。)另外从 V4 开始,也没有 setPlural()
方法。
[1] Note that in SimpleNLG V4, there are no lexicon methods to
directly get inflected variants of a word; in other words, there is no
equivalent in V4 of the SimpleNLG V3 getPlural(), getPastParticiple(),
etc. methods. It is possible in V4 to compute inflected variants of
words, but the process is more complicated: basically we need to
create an InflectedWordElement around the base form, add appropriate
features to this InflectedWordElement, and then realise it.
我认为这可能会奏效:(我没有测试它,因为我现在没有时间。)
// Proposed answer snippet: prints the realised base word of "elves".
// NOTE(review): the surrounding text says this was never tested; whether getBaseWord()
// actually yields the singular form for a plural input must be confirmed against SimpleNLG.
final XMLLexicon xmlLexicon = new XMLLexicon();
// Look up the noun entry for the string "elves".
final WordElement word = xmlLexicon.getWord("elves", LexicalCategory.NOUN);
// Wrap the entry in an InflectedWordElement without setting any inflection features.
final InflectedWordElement singularWord = new InflectedWordElement(word);
// Take the base word back out of the wrapper.
WordElement sw = singularWord.getBaseWord();
// Realise the base word using the same lexicon and print the result.
final Realiser realiser = new Realiser(xmlLexicon);
System.out.println(realiser.realise(sw));
如果这对您或其他任何人都不起作用,欢迎您在这里(文档)和这里(教程)寻找答案。
这段代码帮助了我:
/*
* JBoss DNA (http://www.jboss.org/dna)
* See the COPYRIGHT.txt file distributed with this work for information
* regarding copyright ownership. Some portions may be licensed
* to Red Hat, Inc. under one or more contributor license agreements.
* See the AUTHORS.txt file in the distribution for a full listing of
* individual contributors.
*
* JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
* is licensed to you under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* JBoss DNA is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Transforms words to singular, plural, humanized (human readable), underscore, camel case, or ordinal form. This is inspired by
* the <a href="http://api.rubyonrails.org/classes/Inflector.html">Inflector</a> class in <a
* href="http://www.rubyonrails.org">Ruby on Rails</a>, which is distributed under the <a
* href="http://wiki.rubyonrails.org/rails/pages/License">Rails license</a>.
*
* @author Randall Hauch
*/
public class Inflector {

    /** Shared instance returned by {@link #getInstance()}, populated with the default rules. */
    protected static final Inflector INSTANCE = new Inflector();

    /**
     * Get the shared inflector.
     *
     * @return the shared instance; it is mutable, so rules added to it are seen by every caller
     */
    public static final Inflector getInstance() {
        return INSTANCE;
    }

    /**
     * A single case-insensitive regular-expression rewrite rule. The replacement string uses the
     * {@link Matcher#replaceAll(String)} syntax, so <code>$1</code> refers to the first capture group.
     */
    protected class Rule {

        protected final String expression;
        protected final Pattern expressionPattern;
        protected final String replacement;

        /**
         * @param expression the regular expression that selects the part of the word to rewrite
         * @param replacement the replacement text (may contain group references); null is treated as empty
         */
        protected Rule( String expression,
                        String replacement ) {
            this.expression = expression;
            this.replacement = replacement != null ? replacement : "";
            this.expressionPattern = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
        }

        /**
         * Apply the rule against the input string, returning the modified string or null if the rule didn't apply (and no
         * modifications were made)
         *
         * @param input the input string
         * @return the modified string if this rule applied, or null if the input was not modified by this rule
         */
        protected String apply( String input ) {
            Matcher matcher = this.expressionPattern.matcher(input);
            if (!matcher.find()) return null;
            return matcher.replaceAll(this.replacement);
        }

        /**
         * Hash of the expression with every character case-folded, so that two rules that are
         * {@link #equals(Object) equal} (expressions equal ignoring case) always share a hash code.
         */
        @Override
        public int hashCode() {
            int hash = 0;
            for (int i = 0; i < expression.length(); i++) {
                // Same folding that String.equalsIgnoreCase applies to each character.
                hash = 31 * hash + Character.toLowerCase(Character.toUpperCase(expression.charAt(i)));
            }
            return hash;
        }

        /** Two rules are equal when they are of the same class and their expressions match ignoring case. */
        @Override
        public boolean equals( Object obj ) {
            if (obj == this) return true;
            if (obj != null && obj.getClass() == this.getClass()) {
                final Rule that = (Rule)obj;
                if (this.expression.equalsIgnoreCase(that.expression)) return true;
            }
            return false;
        }

        @Override
        public String toString() {
            return expression + ", " + replacement;
        }
    }

    /** Pluralization rules; the most recently added rule is consulted first. */
    private final LinkedList<Rule> plurals = new LinkedList<Rule>();

    /** Singularization rules; the most recently added rule is consulted first. */
    private final LinkedList<Rule> singulars = new LinkedList<Rule>();

    /**
     * The lowercase words that are to be excluded and not processed. This set can be modified by the users via
     * {@link #getUncountables()}.
     */
    private final Set<String> uncountables = new HashSet<String>();

    /** Create an inflector populated with the default English rules. */
    public Inflector() {
        initialize();
    }

    /**
     * Create an inflector that starts with copies of the rule lists and uncountable words of another inflector.
     *
     * @param original the inflector to copy from
     */
    protected Inflector( Inflector original ) {
        this.plurals.addAll(original.plurals);
        this.singulars.addAll(original.singulars);
        this.uncountables.addAll(original.uncountables);
    }

    /**
     * Create an independent copy of this inflector via the copy constructor.
     *
     * @return a new inflector with the same rules and uncountable words
     */
    @Override
    public Inflector clone() {
        return new Inflector(this);
    }

    // ------------------------------------------------------------------------------------------------
    // Usage functions
    // ------------------------------------------------------------------------------------------------

    /**
     * Returns the plural form of the word in the string.
     *
     * Examples:
     *
     * <pre>
     *   inflector.pluralize("post")               #=> "posts"
     *   inflector.pluralize("octopus")            #=> "octopi"
     *   inflector.pluralize("sheep")              #=> "sheep"
     *   inflector.pluralize("words")              #=> "words"
     *   inflector.pluralize("the blue mailman")   #=> "the blue mailmen"
     *   inflector.pluralize("CamelOctopus")       #=> "CamelOctopi"
     * </pre>
     *
     * Note that {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
     *
     * @param word the word that is to be pluralized.
     * @return the pluralized form of the word, or the word itself if it could not be pluralized; null if the word is null
     * @see #singularize(Object)
     */
    public String pluralize( Object word ) {
        if (word == null) return null;
        String wordStr = word.toString().trim();
        if (wordStr.length() == 0) return wordStr;
        if (isUncountable(wordStr)) return wordStr;
        // The first rule that matches wins.
        for (Rule rule : this.plurals) {
            String result = rule.apply(wordStr);
            if (result != null) return result;
        }
        return wordStr;
    }

    /**
     * Returns the form of the word appropriate for the given count: the word unchanged when the count is 1 or -1,
     * otherwise its {@link #pluralize(Object) plural}.
     *
     * @param word the word that is to be pluralized
     * @param count the number of items being described
     * @return the word as supplied (count of 1 or -1), its plural form, or null if the word is null
     */
    public String pluralize( Object word,
                             int count ) {
        if (word == null) return null;
        if (count == 1 || count == -1) {
            return word.toString();
        }
        return pluralize(word);
    }

    /**
     * Returns the singular form of the word in the string.
     *
     * Examples:
     *
     * <pre>
     *   inflector.singularize("posts")             #=> "post"
     *   inflector.singularize("octopi")            #=> "octopus"
     *   inflector.singularize("sheep")             #=> "sheep"
     *   inflector.singularize("words")             #=> "word"
     *   inflector.singularize("the blue mailmen")  #=> "the blue mailman"
     *   inflector.singularize("CamelOctopi")       #=> "CamelOctopus"
     * </pre>
     *
     * Note that {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
     *
     * @param word the word that is to be singularized.
     * @return the singular form of the word, or the word itself if it could not be singularized; null if the word is null
     * @see #pluralize(Object)
     */
    public String singularize( Object word ) {
        if (word == null) return null;
        String wordStr = word.toString().trim();
        if (wordStr.length() == 0) return wordStr;
        if (isUncountable(wordStr)) return wordStr;
        // The first rule that matches wins.
        for (Rule rule : this.singulars) {
            String result = rule.apply(wordStr);
            if (result != null) return result;
        }
        return wordStr;
    }

    /**
     * Converts strings to lowerCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
     *
     * Examples:
     *
     * <pre>
     *   inflector.lowerCamelCase("active_record")       #=> "activeRecord"
     *   inflector.lowerCamelCase("first_name")          #=> "firstName"
     *   inflector.lowerCamelCase("name")                #=> "name"
     *   inflector.lowerCamelCase("the-first_name",'-')  #=> "theFirstName"
     * </pre>
     *
     * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
     * @param delimiterChars optional characters that are used to delimit word boundaries
     * @return the lower camel case version of the word
     * @see #underscore(String, char[])
     * @see #camelCase(String, boolean, char[])
     * @see #upperCamelCase(String, char[])
     */
    public String lowerCamelCase( String lowerCaseAndUnderscoredWord,
                                  char... delimiterChars ) {
        return camelCase(lowerCaseAndUnderscoredWord, false, delimiterChars);
    }

    /**
     * Converts strings to UpperCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
     *
     * Examples:
     *
     * <pre>
     *   inflector.upperCamelCase("active_record")       #=> "ActiveRecord"
     *   inflector.upperCamelCase("first_name")          #=> "FirstName"
     *   inflector.upperCamelCase("name")                #=> "Name"
     *   inflector.upperCamelCase("the-first_name",'-')  #=> "TheFirstName"
     * </pre>
     *
     * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
     * @param delimiterChars optional characters that are used to delimit word boundaries
     * @return the upper camel case version of the word
     * @see #underscore(String, char[])
     * @see #camelCase(String, boolean, char[])
     * @see #lowerCamelCase(String, char[])
     */
    public String upperCamelCase( String lowerCaseAndUnderscoredWord,
                                  char... delimiterChars ) {
        return camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars);
    }

    /**
     * Converts strings to UpperCamelCase when <code>uppercaseFirstLetter</code> is true, or to lowerCamelCase when it is
     * false. This method will also use any extra delimiter characters to identify word boundaries.
     *
     * Examples:
     *
     * <pre>
     *   inflector.camelCase("active_record",false)    #=> "activeRecord"
     *   inflector.camelCase("active_record",true)     #=> "ActiveRecord"
     *   inflector.camelCase("first_name",false)       #=> "firstName"
     *   inflector.camelCase("first_name",true)        #=> "FirstName"
     *   inflector.camelCase("name",false)             #=> "name"
     *   inflector.camelCase("name",true)              #=> "Name"
     * </pre>
     *
     * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
     * @param uppercaseFirstLetter true if the first character is to be uppercased, or false if the first character is to be
     *        lowercased
     * @param delimiterChars optional characters that are used to delimit word boundaries
     * @return the camel case version of the word; null for null input, empty for blank input
     * @see #underscore(String, char[])
     * @see #upperCamelCase(String, char[])
     * @see #lowerCamelCase(String, char[])
     */
    public String camelCase( String lowerCaseAndUnderscoredWord,
                             boolean uppercaseFirstLetter,
                             char... delimiterChars ) {
        if (lowerCaseAndUnderscoredWord == null) return null;
        lowerCaseAndUnderscoredWord = lowerCaseAndUnderscoredWord.trim();
        if (lowerCaseAndUnderscoredWord.length() == 0) return "";
        if (uppercaseFirstLetter) {
            String result = lowerCaseAndUnderscoredWord;
            // Replace any extra delimiters with underscores (before the underscores are converted in the next step)...
            if (delimiterChars != null) {
                for (char delimiterChar : delimiterChars) {
                    result = result.replace(delimiterChar, '_');
                }
            }
            // Change the case at the beginning and after each underscore (the underscore itself is dropped) ...
            return replaceAllWithUppercase(result, "(^|_)(.)", 2);
        }
        if (lowerCaseAndUnderscoredWord.length() < 2) return lowerCaseAndUnderscoredWord;
        // Lower camel case is the upper camel case form with the original first character lowercased.
        return "" + Character.toLowerCase(lowerCaseAndUnderscoredWord.charAt(0))
               + camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars).substring(1);
    }

    /**
     * Makes an underscored form from the expression in the string (the reverse of the
     * {@link #camelCase(String, boolean, char[]) camelCase} method). Hyphens and any characters that match the supplied
     * delimiters are also changed into underscores.
     *
     * Examples:
     *
     * <pre>
     *   inflector.underscore("activeRecord")        #=> "active_record"
     *   inflector.underscore("ActiveRecord")        #=> "active_record"
     *   inflector.underscore("firstName")           #=> "first_name"
     *   inflector.underscore("FirstName")           #=> "first_name"
     *   inflector.underscore("name")                #=> "name"
     *   inflector.underscore("The.firstName",'.')   #=> "the_first_name"
     * </pre>
     *
     * @param camelCaseWord the camel-cased word that is to be converted;
     * @param delimiterChars optional characters that are used to delimit word boundaries (beyond capitalization)
     * @return a lower-cased version of the input, with separate words delimited by the underscore character.
     */
    public String underscore( String camelCaseWord,
                              char... delimiterChars ) {
        if (camelCaseWord == null) return null;
        String result = camelCaseWord.trim();
        if (result.length() == 0) return "";
        // Split an acronym from the capitalized word that follows it: "HTMLParser" -> "HTML_Parser".
        result = result.replaceAll("([A-Z]+)([A-Z][a-z])", "$1_$2");
        // Split wherever a lowercase letter or digit is followed by an uppercase letter: "firstName" -> "first_Name".
        result = result.replaceAll("([a-z\\d])([A-Z])", "$1_$2");
        result = result.replace('-', '_');
        if (delimiterChars != null) {
            for (char delimiterChar : delimiterChars) {
                result = result.replace(delimiterChar, '_');
            }
        }
        return result.toLowerCase(Locale.ROOT);
    }

    /**
     * Returns a copy of the input with the first character converted to uppercase and the remainder to lowercase.
     * The input is trimmed first.
     *
     * @param words the word to be capitalized
     * @return the string with the first character capitalized and the remaining characters lowercased
     */
    public String capitalize( String words ) {
        if (words == null) return null;
        String result = words.trim();
        if (result.length() == 0) return "";
        if (result.length() == 1) return result.toUpperCase(Locale.ROOT);
        return "" + Character.toUpperCase(result.charAt(0)) + result.substring(1).toLowerCase(Locale.ROOT);
    }

    /**
     * Capitalizes the first word and turns underscores into spaces and strips trailing "_id" and any supplied removable tokens.
     * Like {@link #titleCase(String, String[])}, this is meant for creating pretty output.
     *
     * Examples:
     *
     * <pre>
     *   inflector.humanize("employee_salary")       #=> "Employee salary"
     *   inflector.humanize("author_id")             #=> "Author"
     * </pre>
     *
     * @param lowerCaseAndUnderscoredWords the input to be humanized
     * @param removableTokens optional array of tokens that are to be removed; each token is interpreted as a regular
     *        expression
     * @return the humanized string
     * @see #titleCase(String, String[])
     */
    public String humanize( String lowerCaseAndUnderscoredWords,
                            String... removableTokens ) {
        if (lowerCaseAndUnderscoredWords == null) return null;
        String result = lowerCaseAndUnderscoredWords.trim();
        if (result.length() == 0) return "";
        // Remove a trailing "_id" token
        result = result.replaceAll("_id$", "");
        // Remove all of the tokens that should be removed
        if (removableTokens != null) {
            for (String removableToken : removableTokens) {
                result = result.replaceAll(removableToken, "");
            }
        }
        result = result.replaceAll("_+", " "); // replace all adjacent underscores with a single space
        return capitalize(result);
    }

    /**
     * Capitalizes all the words and replaces some characters in the string to create a nicer looking title. Underscores are
     * changed to spaces, a trailing "_id" is removed, and any of the supplied tokens are removed. Like
     * {@link #humanize(String, String[])}, this is meant for creating pretty output.
     *
     * Examples:
     *
     * <pre>
     *   inflector.titleCase("man from the boondocks")       #=> "Man From The Boondocks"
     *   inflector.titleCase("x-men: the last stand")        #=> "X-Men: The Last Stand"
     * </pre>
     *
     * @param words the input to be turned into title case
     * @param removableTokens optional array of tokens that are to be removed
     * @return the title-case version of the supplied words, or null if the input is null
     */
    public String titleCase( String words,
                             String... removableTokens ) {
        String result = humanize(words, removableTokens);
        if (result == null) return null;
        // Change the first character of each word to uppercase ("\\b" is the regex word boundary).
        return replaceAllWithUppercase(result, "\\b([a-z])", 1);
    }

    /**
     * Turns a number into an ordinal string used to denote the position in an ordered sequence, such as 1st, 2nd,
     * 3rd, 4th. The suffix is chosen from the last two digits, so 11, 12 and 13 (and 111, 212, ...) end in "th".
     *
     * @param number the number (intended to be non-negative; negative values get the suffix of their absolute value)
     * @return the string with the number and ordinal suffix
     */
    public String ordinalize( int number ) {
        String numberStr = Integer.toString(number);
        // Taking the remainder before Math.abs keeps this safe for Integer.MIN_VALUE.
        int lastTwoDigits = Math.abs(number % 100);
        if (11 <= lastTwoDigits && lastTwoDigits <= 13) return numberStr + "th";
        switch (lastTwoDigits % 10) {
            case 1:
                return numberStr + "st";
            case 2:
                return numberStr + "nd";
            case 3:
                return numberStr + "rd";
            default:
                return numberStr + "th";
        }
    }

    // ------------------------------------------------------------------------------------------------
    // Management methods
    // ------------------------------------------------------------------------------------------------

    /**
     * Determine whether the supplied word is considered uncountable by the {@link #pluralize(Object) pluralize} and
     * {@link #singularize(Object) singularize} methods.
     *
     * @param word the word
     * @return true if the plural and singular forms of the word are the same
     */
    public boolean isUncountable( String word ) {
        if (word == null) return false;
        String trimmedLower = word.trim().toLowerCase(Locale.ROOT);
        return this.uncountables.contains(trimmedLower);
    }

    /**
     * Get the set of words that are not processed by the Inflector. The resulting set is directly modifiable.
     *
     * @return the set of uncountable words
     */
    public Set<String> getUncountables() {
        return uncountables;
    }

    /**
     * Add a pluralization rule. The new rule takes precedence over all rules added earlier.
     *
     * @param rule the case-insensitive regular expression to match
     * @param replacement the replacement, which may reference capture groups as <code>$1</code>, <code>$2</code>, ...
     */
    public void addPluralize( String rule,
                              String replacement ) {
        final Rule pluralizeRule = new Rule(rule, replacement);
        this.plurals.addFirst(pluralizeRule);
    }

    /**
     * Add a singularization rule. The new rule takes precedence over all rules added earlier.
     *
     * @param rule the case-insensitive regular expression to match
     * @param replacement the replacement, which may reference capture groups as <code>$1</code>, <code>$2</code>, ...
     */
    public void addSingularize( String rule,
                                String replacement ) {
        final Rule singularizeRule = new Rule(rule, replacement);
        this.singulars.addFirst(singularizeRule);
    }

    /**
     * Register an irregular singular/plural pair in both directions. The first letter is captured and re-emitted so that
     * the capitalization of the input word is kept.
     *
     * @param singular the singular form, e.g. "person"
     * @param plural the plural form, e.g. "people"
     * @throws IllegalArgumentException if either argument is null or empty
     */
    public void addIrregular( String singular,
                              String plural ) {
        if (singular == null || singular.length() == 0) {
            throw new IllegalArgumentException("The singular rule may not be null or empty");
        }
        if (plural == null || plural.length() == 0) {
            throw new IllegalArgumentException("The plural rule may not be null or empty");
        }
        String singularRemainder = singular.length() > 1 ? singular.substring(1) : "";
        String pluralRemainder = plural.length() > 1 ? plural.substring(1) : "";
        addPluralize("(" + singular.charAt(0) + ")" + singularRemainder + "$", "$1" + pluralRemainder);
        addSingularize("(" + plural.charAt(0) + ")" + pluralRemainder + "$", "$1" + singularRemainder);
    }

    /**
     * Register words whose singular and plural forms are identical. Words are stored trimmed and lowercased; null
     * entries are ignored.
     *
     * @param words the uncountable words
     */
    public void addUncountable( String... words ) {
        if (words == null || words.length == 0) return;
        for (String word : words) {
            if (word != null) uncountables.add(word.trim().toLowerCase(Locale.ROOT));
        }
    }

    /**
     * Utility method to replace every match of the regular expression with the uppercased text of one of its capture
     * groups; everything else in the match is removed.
     *
     * Java {@link Pattern regular expression processing} does not support the case-conversion directives
     * <code>&#92;l</code>, <code>&#92;u</code>, <code>&#92;L</code> and <code>&#92;U</code> in replacement strings, which
     * is why this is done by hand.
     *
     * @param input the string to process
     * @param regex the regular expression to search for
     * @param groupNumberToUppercase the capture group whose uppercased text replaces each match
     * @return the input string with the appropriate characters converted to upper-case
     */
    protected static String replaceAllWithUppercase( String input,
                                                     String regex,
                                                     int groupNumberToUppercase ) {
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(input);
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            String uppercased = matcher.group(groupNumberToUppercase).toUpperCase(Locale.ROOT);
            // Quote the text so that a '$' or '\' coming from the input is not read as a group reference.
            matcher.appendReplacement(sb, Matcher.quoteReplacement(uppercased));
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Completely remove all rules within this inflector.
     */
    public void clear() {
        this.uncountables.clear();
        this.plurals.clear();
        this.singulars.clear();
    }

    /**
     * Install the default English rules. Rules are consulted in the reverse of the order in which they are added, so the
     * most general rules come first here and the most specific ones (and the irregular words) come last.
     */
    protected void initialize() {
        Inflector inflect = this;
        inflect.addPluralize("$", "s");
        inflect.addPluralize("s$", "s");
        inflect.addPluralize("(ax|test)is$", "$1es");
        inflect.addPluralize("(octop|vir)us$", "$1i");
        inflect.addPluralize("(octop|vir)i$", "$1i"); // already plural
        inflect.addPluralize("(alias|status)$", "$1es");
        inflect.addPluralize("(bu)s$", "$1ses");
        inflect.addPluralize("(buffal|tomat)o$", "$1oes");
        inflect.addPluralize("([ti])um$", "$1a");
        inflect.addPluralize("([ti])a$", "$1a"); // already plural
        inflect.addPluralize("sis$", "ses");
        inflect.addPluralize("(?:([^f])fe|([lr])f)$", "$1$2ves");
        inflect.addPluralize("(hive)$", "$1s");
        inflect.addPluralize("([^aeiouy]|qu)y$", "$1ies");
        inflect.addPluralize("(x|ch|ss|sh)$", "$1es");
        // The suffix alternatives are grouped so that only matr/vert/ind words are affected, not every "-ex" word.
        inflect.addPluralize("(matr|vert|ind)(?:ix|ex)$", "$1ices");
        inflect.addPluralize("([m|l])ouse$", "$1ice");
        inflect.addPluralize("([m|l])ice$", "$1ice");
        inflect.addPluralize("^(ox)$", "$1en");
        inflect.addPluralize("(quiz)$", "$1zes");
        // Need to check for the following words that are already pluralized:
        inflect.addPluralize("(people|men|children|sexes|moves|stadiums)$", "$1"); // irregulars
        inflect.addPluralize("(oxen|octopi|viri|aliases|quizzes)$", "$1"); // special rules

        inflect.addSingularize("s$", "");
        inflect.addSingularize("(s|si|u)s$", "$1s"); // '-us' and '-ss' are already singular
        inflect.addSingularize("(n)ews$", "$1ews");
        inflect.addSingularize("([ti])a$", "$1um");
        inflect.addSingularize("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1sis");
        inflect.addSingularize("(^analy)ses$", "$1sis");
        inflect.addSingularize("(^analy)sis$", "$1sis"); // already singular, but ends in 's'
        inflect.addSingularize("([^f])ves$", "$1fe");
        inflect.addSingularize("(hive)s$", "$1");
        inflect.addSingularize("(tive)s$", "$1");
        inflect.addSingularize("([lr])ves$", "$1f");
        inflect.addSingularize("([^aeiouy]|qu)ies$", "$1y");
        inflect.addSingularize("(s)eries$", "$1eries");
        inflect.addSingularize("(m)ovies$", "$1ovie");
        inflect.addSingularize("(x|ch|ss|sh)es$", "$1");
        inflect.addSingularize("([m|l])ice$", "$1ouse");
        inflect.addSingularize("(bus)es$", "$1");
        inflect.addSingularize("(o)es$", "$1");
        inflect.addSingularize("(shoe)s$", "$1");
        inflect.addSingularize("(cris|ax|test)is$", "$1is"); // already singular, but ends in 's'
        inflect.addSingularize("(cris|ax|test)es$", "$1is");
        inflect.addSingularize("(octop|vir)i$", "$1us");
        inflect.addSingularize("(octop|vir)us$", "$1us"); // already singular, but ends in 's'
        inflect.addSingularize("(alias|status)es$", "$1");
        inflect.addSingularize("(alias|status)$", "$1"); // already singular, but ends in 's'
        inflect.addSingularize("^(ox)en", "$1");
        inflect.addSingularize("(vert|ind)ices$", "$1ex");
        inflect.addSingularize("(matr)ices$", "$1ix");
        inflect.addSingularize("(quiz)zes$", "$1");

        inflect.addIrregular("person", "people");
        inflect.addIrregular("man", "men");
        inflect.addIrregular("child", "children");
        inflect.addIrregular("sex", "sexes");
        inflect.addIrregular("move", "moves");
        inflect.addIrregular("stadium", "stadiums");

        inflect.addUncountable("equipment", "information", "rice", "money", "species", "series", "fish", "sheep");
    }
}
这是使用 SimpleNLG 完成的 Java API
我想将 "elves" 转换为 "elf"。下面的代码是单数转复数,怎么修改才能复数转单数?
// Question snippet: realises the plural form of a noun with SimpleNLG.
// Create the lexicon with its no-argument constructor.
final XMLLexicon xmlLexicon = new XMLLexicon();
// Look up the noun entry for the string "elves".
final WordElement word = xmlLexicon.getWord("elves", LexicalCategory.NOUN);
// Wrap the lexicon entry so that inflection settings can be attached to it.
final InflectedWordElement pluralWord = new InflectedWordElement(word);
// Request the plural form.
pluralWord.setPlural(true);
// Realise the inflected word using the same lexicon and print the result.
final Realiser realiser = new Realiser(xmlLexicon);
System.out.println(realiser.realise(pluralWord));
这个 API 中显然没有 setSingular()
方法(我真的很依赖那个方法,我觉得这有点有趣,没有一个方法可以处理这样的事情。)另外从 V4 开始,也没有 setPlural()
方法。
[1] Note that in SimpleNLG V4, there are no lexicon methods to directly get inflected variants of a word; in other words, there is no equivalent in V4 of the SimpleNLG V3 getPlural(), getPastParticiple(), etc. methods. It is possible in V4 to compute inflected variants of words, but the process is more complicated: basically we need to create an InflectedWordElement around the base form, add appropriate features to this InflectedWordElement, and then realise it.
我认为这可能会奏效:(我没有测试它,因为我现在没有时间。)
// Proposed answer snippet: prints the realised base word of "elves".
// NOTE(review): the surrounding text says this was never tested; whether getBaseWord()
// actually yields the singular form for a plural input must be confirmed against SimpleNLG.
final XMLLexicon xmlLexicon = new XMLLexicon();
// Look up the noun entry for the string "elves".
final WordElement word = xmlLexicon.getWord("elves", LexicalCategory.NOUN);
// Wrap the entry in an InflectedWordElement without setting any inflection features.
final InflectedWordElement singularWord = new InflectedWordElement(word);
// Take the base word back out of the wrapper.
WordElement sw = singularWord.getBaseWord();
// Realise the base word using the same lexicon and print the result.
final Realiser realiser = new Realiser(xmlLexicon);
System.out.println(realiser.realise(sw));
如果这对您或其他任何人都不起作用,欢迎您在这里(文档)和这里(教程)寻找答案。
这段代码帮助了我:
/*
* JBoss DNA (http://www.jboss.org/dna)
* See the COPYRIGHT.txt file distributed with this work for information
* regarding copyright ownership. Some portions may be licensed
* to Red Hat, Inc. under one or more contributor license agreements.
* See the AUTHORS.txt file in the distribution for a full listing of
* individual contributors.
*
* JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
* is licensed to you under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* JBoss DNA is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Transforms words to singular, plural, humanized (human readable), underscore, camel case, or ordinal form. This is inspired by
* the <a href="http://api.rubyonrails.org/classes/Inflector.html">Inflector</a> class in <a
* href="http://www.rubyonrails.org">Ruby on Rails</a>, which is distributed under the <a
* href="http://wiki.rubyonrails.org/rails/pages/License">Rails license</a>.
*
* @author Randall Hauch
*/
public class Inflector {
protected static final Inflector INSTANCE = new Inflector();
public static final Inflector getInstance() {
return INSTANCE;
}
protected class Rule {
protected final String expression;
protected final Pattern expressionPattern;
protected final String replacement;
protected Rule( String expression,
String replacement ) {
this.expression = expression;
this.replacement = replacement != null ? replacement : "";
this.expressionPattern = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
}
/**
* Apply the rule against the input string, returning the modified string or null if the rule didn't apply (and no
* modifications were made)
*
* @param input the input string
* @return the modified string if this rule applied, or null if the input was not modified by this rule
*/
protected String apply( String input ) {
Matcher matcher = this.expressionPattern.matcher(input);
if (!matcher.find()) return null;
return matcher.replaceAll(this.replacement);
}
@Override
public int hashCode() {
return expression.hashCode();
}
@Override
public boolean equals( Object obj ) {
if (obj == this) return true;
if (obj != null && obj.getClass() == this.getClass()) {
final Rule that = (Rule)obj;
if (this.expression.equalsIgnoreCase(that.expression)) return true;
}
return false;
}
@Override
public String toString() {
return expression + ", " + replacement;
}
}
private LinkedList<Rule> plurals = new LinkedList<Rule>();
private LinkedList<Rule> singulars = new LinkedList<Rule>();
/**
* The lowercase words that are to be excluded and not processed. This map can be modified by the users via
* {@link #getUncountables()}.
*/
private final Set<String> uncountables = new HashSet<String>();
public Inflector() {
initialize();
}
protected Inflector( Inflector original ) {
this.plurals.addAll(original.plurals);
this.singulars.addAll(original.singulars);
this.uncountables.addAll(original.uncountables);
}
@Override
public Inflector clone() {
return new Inflector(this);
}
// ------------------------------------------------------------------------------------------------
// Usage functions
// ------------------------------------------------------------------------------------------------
/**
* Returns the plural form of the word in the string.
*
* Examples:
*
* <pre>
* inflector.pluralize("post") #=> "posts"
* inflector.pluralize("octopus") #=> "octopi"
* inflector.pluralize("sheep") #=> "sheep"
* inflector.pluralize("words") #=> "words"
* inflector.pluralize("the blue mailman") #=> "the blue mailmen"
* inflector.pluralize("CamelOctopus") #=> "CamelOctopi"
* </pre>
*
*
*
* Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
*
*
* @param word the word that is to be pluralized.
* @return the pluralized form of the word, or the word itself if it could not be pluralized
* @see #singularize(Object)
*/
public String pluralize( Object word ) {
if (word == null) return null;
String wordStr = word.toString().trim();
if (wordStr.length() == 0) return wordStr;
if (isUncountable(wordStr)) return wordStr;
for (Rule rule : this.plurals) {
String result = rule.apply(wordStr);
if (result != null) return result;
}
return wordStr;
}
public String pluralize( Object word,
int count ) {
if (word == null) return null;
if (count == 1 || count == -1) {
return word.toString();
}
return pluralize(word);
}
/**
* Returns the singular form of the word in the string.
*
* Examples:
*
* <pre>
* inflector.singularize("posts") #=> "post"
* inflector.singularize("octopi") #=> "octopus"
* inflector.singularize("sheep") #=> "sheep"
* inflector.singularize("words") #=> "word"
* inflector.singularize("the blue mailmen") #=> "the blue mailman"
* inflector.singularize("CamelOctopi") #=> "CamelOctopus"
* </pre>
*
*
*
* Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
*
*
* @param word the word that is to be singularized.
* @return the singular form of the word, or the word itself if it could not be singularized
* @see #pluralize(Object)
*/
public String singularize( Object word ) {
if (word == null) return null;
String wordStr = word.toString().trim();
if (wordStr.length() == 0) return wordStr;
if (isUncountable(wordStr)) return wordStr;
for (Rule rule : this.singulars) {
String result = rule.apply(wordStr);
if (result != null) return result;
}
return wordStr;
}
/**
* Converts strings to lowerCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
*
* Examples:
*
* <pre>
* inflector.lowerCamelCase("active_record") #=> "activeRecord"
* inflector.lowerCamelCase("first_name") #=> "firstName"
* inflector.lowerCamelCase("name") #=> "name"
* inflector.lowerCamelCase("the-first_name",'-') #=> "theFirstName"
* </pre>
*
*
*
* @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
* @param delimiterChars optional characters that are used to delimit word boundaries
* @return the lower camel case version of the word
* @see #underscore(String, char[])
* @see #camelCase(String, boolean, char[])
* @see #upperCamelCase(String, char[])
*/
public String lowerCamelCase( String lowerCaseAndUnderscoredWord,
char... delimiterChars ) {
return camelCase(lowerCaseAndUnderscoredWord, false, delimiterChars);
}
/**
* Converts strings to UpperCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
*
* Examples:
*
* <pre>
* inflector.upperCamelCase("active_record") #=> "ActiveRecord"
* inflector.upperCamelCase("first_name") #=> "FirstName"
* inflector.upperCamelCase("name") #=> "Name"
* inflector.upperCamelCase("the-first_name",'-') #=> "TheFirstName"
* </pre>
*
*
*
* @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
* @param delimiterChars optional characters that are used to delimit word boundaries
* @return the upper camel case version of the word
* @see #underscore(String, char[])
* @see #camelCase(String, boolean, char[])
* @see #lowerCamelCase(String, char[])
*/
public String upperCamelCase( String lowerCaseAndUnderscoredWord,
char... delimiterChars ) {
return camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars);
}
/**
 * Converts a string to camel case. When <code>uppercaseFirstLetter</code> is <code>true</code> the result is
 * UpperCamelCase; when it is <code>false</code> the result is lowerCamelCase. Underscores, plus any supplied delimiter
 * characters, are treated as word boundaries.
 *
 * Examples:
 *
 * <pre>
 *   inflector.camelCase("active_record",false)    #=&gt; "activeRecord"
 *   inflector.camelCase("active_record",true)     #=&gt; "ActiveRecord"
 *   inflector.camelCase("first_name",false)       #=&gt; "firstName"
 *   inflector.camelCase("first_name",true)        #=&gt; "FirstName"
 *   inflector.camelCase("name",false)             #=&gt; "name"
 *   inflector.camelCase("name",true)              #=&gt; "Name"
 * </pre>
 *
 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case; may be null
 * @param uppercaseFirstLetter true if the first character is to be uppercased, or false if the first character is to be
 *        lowercased
 * @param delimiterChars optional characters that are used to delimit word boundaries
 * @return the camel case version of the word; null if the input was null, or the empty string if the input was blank
 * @see #underscore(String, char[])
 * @see #upperCamelCase(String, char[])
 * @see #lowerCamelCase(String, char[])
 */
public String camelCase( String lowerCaseAndUnderscoredWord,
                         boolean uppercaseFirstLetter,
                         char... delimiterChars ) {
    if (lowerCaseAndUnderscoredWord == null) {
        return null;
    }
    final String trimmed = lowerCaseAndUnderscoredWord.trim();
    if (trimmed.length() == 0) {
        return "";
    }

    if (!uppercaseFirstLetter) {
        // Inputs shorter than two characters are returned untouched in lower-camel mode.
        if (trimmed.length() < 2) {
            return trimmed;
        }
        // Build the upper-camel form, then swap in the lower-cased first character of the input.
        final char lowerFirst = Character.toLowerCase(trimmed.charAt(0));
        final String upperCamel = camelCase(trimmed, true, delimiterChars);
        return lowerFirst + upperCamel.substring(1);
    }

    // Normalize every extra delimiter to an underscore so a single pattern can find the word boundaries.
    String underscored = trimmed;
    if (delimiterChars != null) {
        for (int i = 0; i < delimiterChars.length; i++) {
            underscored = underscored.replace(delimiterChars[i], '_');
        }
    }
    // Uppercase the character at the start and after each underscore; the matched underscore itself is dropped.
    return replaceAllWithUppercase(underscored, "(^|_)(.)", 2);
}
/**
* Makes an underscored form from the expression in the string (the reverse of the {@link #camelCase(String, boolean, char[])
* camelCase} method. Also changes any characters that match the supplied delimiters into underscore.
*
* Examples:
*
* <pre>
* inflector.underscore("activeRecord") #=> "active_record"
* inflector.underscore("ActiveRecord") #=> "active_record"
* inflector.underscore("firstName") #=> "first_name"
* inflector.underscore("FirstName") #=> "first_name"
* inflector.underscore("name") #=> "name"
* inflector.underscore("The.firstName") #=> "the_first_name"
* </pre>
*
*
*
* @param camelCaseWord the camel-cased word that is to be converted;
* @param delimiterChars optional characters that are used to delimit word boundaries (beyond capitalization)
* @return a lower-cased version of the input, with separate words delimited by the underscore character.
*/
public String underscore( String camelCaseWord,
char... delimiterChars ) {
if (camelCaseWord == null) return null;
String result = camelCaseWord.trim();
if (result.length() == 0) return "";
result = result.replaceAll("([A-Z]+)([A-Z][a-z])", "_");
result = result.replaceAll("([a-z\d])([A-Z])", "_");
result = result.replace('-', '_');
if (delimiterChars != null) {
for (char delimiterChar : delimiterChars) {
result = result.replace(delimiterChar, '_');
}
}
return result.toLowerCase();
}
/**
 * Produces a trimmed copy of the input whose first character is uppercased and whose remaining characters are
 * lowercased.
 *
 * @param words the word to be capitalized; may be null
 * @return the capitalized string; null if the input was null, or the empty string if the input was blank
 */
public String capitalize( String words ) {
    if (words == null) {
        return null;
    }
    final String trimmed = words.trim();
    switch (trimmed.length()) {
        case 0:
            return "";
        case 1:
            // A single character is uppercased with the String (not the char) conversion.
            return trimmed.toUpperCase();
        default:
            return Character.toUpperCase(trimmed.charAt(0)) + trimmed.substring(1).toLowerCase();
    }
}
/**
 * Produces a human-readable form of an underscored identifier: a trailing "_id" is stripped, every supplied removable
 * token is removed, runs of underscores become a single space, and the result is passed through
 * {@link #capitalize(String)}. Like {@link #titleCase(String, String[])}, this is meant for creating pretty output.
 * <p>
 * Each removable token is handed to {@link String#replaceAll(String, String)} and is therefore interpreted as a regular
 * expression.
 * </p>
 *
 * Examples:
 *
 * <pre>
 *   inflector.humanize("employee_salary")       #=&gt; "Employee salary"
 *   inflector.humanize("author_id")             #=&gt; "Author"
 * </pre>
 *
 * @param lowerCaseAndUnderscoredWords the input to be humanized; may be null
 * @param removableTokens optional array of tokens (regular expressions) that are to be removed
 * @return the humanized string; null if the input was null, or the empty string if the input was blank
 * @see #titleCase(String, String[])
 */
public String humanize( String lowerCaseAndUnderscoredWords,
                        String... removableTokens ) {
    if (lowerCaseAndUnderscoredWords == null) {
        return null;
    }
    final String trimmed = lowerCaseAndUnderscoredWords.trim();
    if (trimmed.length() == 0) {
        return "";
    }
    // Drop a trailing "_id" first, before any caller-supplied tokens are removed.
    String text = trimmed.replaceAll("_id$", "");
    if (removableTokens != null) {
        for (int i = 0; i < removableTokens.length; i++) {
            text = text.replaceAll(removableTokens[i], "");
        }
    }
    // Collapse each run of underscores into one space, then capitalize the whole phrase.
    final String spaced = text.replaceAll("_+", " ");
    return capitalize(spaced);
}
/**
 * Capitalizes all the words and replaces some characters in the string to create a nicer looking title. The input is
 * first passed through {@link #humanize(String, String[])} (so underscores are changed to spaces, a trailing "_id" is
 * removed, and any of the supplied tokens are removed), and then the first lower-case letter of every word is
 * uppercased. Like {@link #humanize(String, String[])}, this is meant for creating pretty output.
 *
 * Examples:
 *
 * <pre>
 *   inflector.titleCase("man from the boondocks")       #=&gt; "Man From The Boondocks"
 *   inflector.titleCase("x-men: the last stand")        #=&gt; "X-Men: The Last Stand"
 * </pre>
 *
 * @param words the input to be turned into title case; may be null
 * @param removableTokens optional array of tokens that are to be removed
 * @return the title-case version of the supplied words, or null if the input was null
 */
public String titleCase( String words,
                         String... removableTokens ) {
    String result = humanize(words, removableTokens);
    // humanize returns null for null input; mirror that instead of failing inside the regex matcher.
    if (result == null) return null;
    // "\\b" is the regex word boundary. A single-backslash "\b" in a Java literal is a backspace character,
    // which would never match ordinary text and leave the words uncapitalized.
    result = replaceAllWithUppercase(result, "\\b([a-z])", 1); // change first char of each word to uppercase
    return result;
}
/**
 * Turns a number into an ordinal string used to denote the position in an ordered sequence, such as 1st, 2nd, 3rd,
 * 4th. Numbers whose last two digits are 11, 12 or 13 always take the "th" suffix (11th, 112th, 1013th); otherwise
 * the suffix is chosen from the last digit.
 *
 * @param number the number; intended to be non-negative (for a negative value the suffix is chosen from its
 *        magnitude)
 * @return the string with the number and ordinal suffix
 */
public String ordinalize( int number ) {
    final String numberStr = Integer.toString(number);
    // The teens exception depends on the last TWO digits, not on the whole number, so 111 -> "111th".
    // Math.abs is safe here because the remainder is always strictly between -100 and 100.
    final int lastTwoDigits = Math.abs(number % 100);
    if (11 <= lastTwoDigits && lastTwoDigits <= 13) return numberStr + "th";
    switch (lastTwoDigits % 10) {
        case 1:
            return numberStr + "st";
        case 2:
            return numberStr + "nd";
        case 3:
            return numberStr + "rd";
        default:
            return numberStr + "th";
    }
}
// ------------------------------------------------------------------------------------------------
// Management methods
// ------------------------------------------------------------------------------------------------
/**
 * Reports whether the supplied word is registered as uncountable, i.e. whether the
 * {@link #pluralize(Object) pluralize} and {@link #singularize(Object) singularize} methods should leave it alone.
 * The word is trimmed and lower-cased before it is looked up in the set of uncountable words.
 *
 * @param word the word; may be null
 * @return true if the word is in the set of uncountable words; false otherwise, including when the word is null
 */
public boolean isUncountable( String word ) {
    return word != null && this.uncountables.contains(word.trim().toLowerCase());
}
/**
 * Get the set of words that are not processed by the Inflector. The set held by this inflector is returned
 * directly, not a copy.
 *
 * @return the set of uncountable words
 */
public Set<String> getUncountables() {
    return this.uncountables;
}
/**
 * Registers a pluralization rule. The new rule is inserted at the front of the list of plural rules, ahead of every
 * rule registered earlier.
 *
 * @param rule the pattern handed to the {@link Rule} constructor
 * @param replacement the replacement text handed to the {@link Rule} constructor
 */
public void addPluralize( String rule,
                          String replacement ) {
    this.plurals.addFirst(new Rule(rule, replacement));
}
/**
 * Registers a singularization rule. The new rule is inserted at the front of the list of singular rules, so
 * {@link #singularize(Object)} tries it before every rule registered earlier.
 *
 * @param rule the pattern handed to the {@link Rule} constructor
 * @param replacement the replacement text handed to the {@link Rule} constructor
 */
public void addSingularize( String rule,
                            String replacement ) {
    this.singulars.addFirst(new Rule(rule, replacement));
}
/**
 * Registers an irregular singular/plural pair (for example "person"/"people") by adding one pluralization rule and
 * one singularization rule. In each rule the first letter of the word is captured as group 1 and re-emitted through
 * <code>$1</code> in the replacement, followed by the remainder of the other form.
 * <p>
 * NOTE(review): this assumes {@link Rule} applies its replacement with java.util.regex semantics (so that
 * <code>$1</code> is a group reference); confirm against the Rule implementation.
 * </p>
 *
 * @param singular the singular form; may not be null or empty
 * @param plural the plural form; may not be null or empty
 * @throws IllegalArgumentException if either argument is null or empty
 */
public void addIrregular( String singular,
                          String plural ) {
    if (singular == null || singular.length() == 0) {
        throw new IllegalArgumentException("The singular rule may not be null or empty");
    }
    if (plural == null || plural.length() == 0) {
        throw new IllegalArgumentException("The plural rule may not be null or empty");
    }
    // substring(1) yields "" for a one-character word, so no special case is needed.
    String singularRemainder = singular.substring(1);
    String pluralRemainder = plural.substring(1);
    // Without the "$1" the captured first letter would be dropped ("person" -> "eople").
    addPluralize("(" + singular.charAt(0) + ")" + singularRemainder + "$", "$1" + pluralRemainder);
    addSingularize("(" + plural.charAt(0) + ")" + pluralRemainder + "$", "$1" + singularRemainder);
}
/**
 * Registers words whose singular and plural forms are the same. Each non-null word is trimmed and lower-cased before
 * it is added to the set of uncountable words; null entries (and a null or empty array) are ignored.
 *
 * @param words the words to register as uncountable
 */
public void addUncountable( String... words ) {
    if (words == null) {
        return;
    }
    for (int i = 0; i < words.length; i++) {
        final String candidate = words[i];
        if (candidate == null) {
            continue;
        }
        uncountables.add(candidate.trim().toLowerCase());
    }
}
/**
* Utility method to replace all occurrences given by the specific backreference with its uppercased form, and remove all
* other backreferences.
*
* The Java {@link Pattern regular expression processing} does not use the preprocessing directives <code>\l</code>,
* <code>\u</code>, <code>\L</code>, and <code>\U</code>. If so, such directives could be used in the replacement string
* to uppercase or lowercase the backreferences. For example, <code>\L1</code> would lowercase the first backreference, and
* <code>\u3</code> would uppercase the 3rd backreference.
*
*
* @param input
* @param regex
* @param groupNumberToUppercase
* @return the input string with the appropriate characters converted to upper-case
*/
protected static String replaceAllWithUppercase( String input,
String regex,
int groupNumberToUppercase ) {
Pattern underscoreAndDotPattern = Pattern.compile(regex);
Matcher matcher = underscoreAndDotPattern.matcher(input);
StringBuffer sb = new StringBuffer();
while (matcher.find()) {
matcher.appendReplacement(sb, matcher.group(groupNumberToUppercase).toUpperCase());
}
matcher.appendTail(sb);
return sb.toString();
}
/**
 * Empties this inflector: the uncountable words, the pluralization rules and the singularization rules are all
 * removed.
 */
public void clear() {
    uncountables.clear();
    plurals.clear();
    singulars.clear();
}
/**
 * Registers the default English pluralization rules, singularization rules, irregular words and uncountable words.
 * <p>
 * Order matters: {@link #addPluralize(String, String)} and {@link #addSingularize(String, String)} insert each rule at
 * the front of its list, so rules registered later in this method take precedence over the more general ones
 * registered earlier.
 * </p>
 * <p>
 * Every replacement re-emits the text captured by the pattern through <code>$1</code> (and <code>$2</code> where two
 * groups exist); without those references the captured stem would be deleted from the result.
 * NOTE(review): this assumes {@link Rule} applies its replacement with java.util.regex semantics; confirm against
 * the Rule implementation.
 * </p>
 */
protected void initialize() {
    addPluralize("$", "s");
    addPluralize("s$", "s");
    addPluralize("(ax|test)is$", "$1es");
    addPluralize("(octop|vir)us$", "$1i");
    addPluralize("(octop|vir)i$", "$1i"); // already plural
    addPluralize("(alias|status)$", "$1es");
    addPluralize("(bu)s$", "$1ses");
    addPluralize("(buffal|tomat)o$", "$1oes");
    addPluralize("([ti])um$", "$1a");
    addPluralize("([ti])a$", "$1a"); // already plural
    addPluralize("sis$", "ses");
    // Only one of the two groups participates in any match; the other contributes nothing.
    addPluralize("(?:([^f])fe|([lr])f)$", "$1$2ves");
    addPluralize("(hive)$", "$1s");
    addPluralize("([^aeiouy]|qu)y$", "$1ies");
    addPluralize("(x|ch|ss|sh)$", "$1es");
    // NOTE(review): alternation binds loosest, so this reads "(matr|vert|ind)ix" OR "ex$"; the pattern is
    // kept as it was because changing it would alter the plural of every word ending in "ex".
    addPluralize("(matr|vert|ind)ix|ex$", "$1ices");
    addPluralize("([m|l])ouse$", "$1ice");
    addPluralize("([m|l])ice$", "$1ice");
    addPluralize("^(ox)$", "$1en");
    addPluralize("(quiz)$", "$1zes");
    // Need to check for the following words that are already pluralized:
    addPluralize("(people|men|children|sexes|moves|stadiums)$", "$1"); // irregulars
    addPluralize("(oxen|octopi|viri|aliases|quizzes)$", "$1"); // special rules

    addSingularize("s$", "");
    addSingularize("(s|si|u)s$", "$1s"); // '-us' and '-ss' are already singular
    addSingularize("(n)ews$", "$1ews");
    addSingularize("([ti])a$", "$1um");
    // Group 1 is the whole stem ("analy", "ba", "diagno", ...); the nested single-letter groups are not re-emitted.
    addSingularize("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1sis");
    addSingularize("(^analy)ses$", "$1sis");
    addSingularize("(^analy)sis$", "$1sis"); // already singular, but ends in 's'
    addSingularize("([^f])ves$", "$1fe");
    addSingularize("(hive)s$", "$1");
    addSingularize("(tive)s$", "$1");
    addSingularize("([lr])ves$", "$1f");
    addSingularize("([^aeiouy]|qu)ies$", "$1y");
    addSingularize("(s)eries$", "$1eries");
    addSingularize("(m)ovies$", "$1ovie");
    addSingularize("(x|ch|ss|sh)es$", "$1");
    addSingularize("([m|l])ice$", "$1ouse");
    addSingularize("(bus)es$", "$1");
    addSingularize("(o)es$", "$1");
    addSingularize("(shoe)s$", "$1");
    addSingularize("(cris|ax|test)is$", "$1is"); // already singular, but ends in 's'
    addSingularize("(cris|ax|test)es$", "$1is");
    addSingularize("(octop|vir)i$", "$1us");
    addSingularize("(octop|vir)us$", "$1us"); // already singular, but ends in 's'
    addSingularize("(alias|status)es$", "$1");
    addSingularize("(alias|status)$", "$1"); // already singular, but ends in 's'
    addSingularize("^(ox)en", "$1");
    addSingularize("(vert|ind)ices$", "$1ex");
    addSingularize("(matr)ices$", "$1ix");
    addSingularize("(quiz)zes$", "$1");

    addIrregular("person", "people");
    addIrregular("man", "men");
    addIrregular("child", "children");
    addIrregular("sex", "sexes");
    addIrregular("move", "moves");
    addIrregular("stadium", "stadiums");

    addUncountable("equipment", "information", "rice", "money", "species", "series", "fish", "sheep");
}
}