/*******************************************************************************
 * This file is part of GECAMed.
 * 
 * GECAMed is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License (L-GPL) as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * GECAMed is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License (L-GPL)
 * along with GECAMed.  If not, see <http://www.gnu.org/licenses/>.
 * 
 * GECAMed is Copyrighted by the Centre de Recherche Public Henri Tudor (http://www.tudor.lu)
 * (c) CRP Henri Tudor, Luxembourg, 2008
 *******************************************************************************/
package lu.tudor.santec.gecamed.core.utils;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//***************************************************************************
//* Class Definition and Members                                            *
//***************************************************************************

/**
 * The StringUtilities class is a collection of static utility methods related
 * to string processing: edit distance between two strings, capitalization of
 * words and replacement of accented Latin characters by plain ASCII letters.
 * The class keeps no state; every method only works on its arguments.
 */

public class StringUtilities 
{

//***************************************************************************
//* Class Body                                                              *
//***************************************************************************

//---------------------------------------------------------------------------
/**
 * The method calculates the Levenshtein distance between the specified strings.
 * The Levenshtein distance between two strings is the minimum number of 
 * operations needed to transform one string into the other, where an operation
 * is an insertion, deletion, or substitution of a single character. It is 
 * useful in applications that need to determine how similar two strings are, 
 * such as spell checkers.
 * <p>
 * Instead of a full (s.length()+1) by (t.length()+1) matrix, only two rows of
 * length s.length()+1 are kept (the previous row and the row being computed),
 * so memory usage stays linear in the length of <code>s</code>.
 * @param s specifies the first string
 * @param t specifies the second string
 * @return the levenshtein distance between the two specified strings.
 * @throws IllegalArgumentException if either string is <code>null</code>.
 * @author Michael Gilleland
 */
//---------------------------------------------------------------------------

public static int getLevenshteinDistance (String s, String t) 
	{
	if (s == null || t == null) 
		{
		throw new IllegalArgumentException("Strings must not be null");
		}

	int n = s.length(); // length of s
	int m = t.length(); // length of t

	// Transforming to or from an empty string costs one operation per character.
	if (n == 0) return m;
	if (m == 0) return n;

	int[] p = new int[n + 1]; // 'previous' cost row
	int[] d = new int[n + 1]; // cost row currently being computed

	// Row 0: turning the first i characters of s into the empty prefix of t.
	for (int i = 0; i <= n; i++) 
		{
		p[i] = i;
		}

	for (int j = 1; j <= m; j++) 
		{
		char t_j = t.charAt(j - 1); // jth character of t
		d[0] = j;

		for (int i = 1; i <= n; i++) 
			{
			int cost = (s.charAt(i - 1) == t_j) ? 0 : 1;
			// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
			d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
			}

		// The rows are swapped rather than copied; the freshly computed row
		// becomes the 'previous' row of the next iteration.
		int[] l_Swap = p;
		p = d;
		d = l_Swap;
		}

	// The last action in the loop was the swap, so p holds the most recent costs.
	return p[n];
	}

//---------------------------------------------------------------------------
/**
 * The capitalizeWords method takes a string as input, converts it to lower
 * case and then capitalizes the first character of every word. A word is a 
 * run of letters, digits and underscores; only a letter in first position is
 * capitalized, so "3abc" stays "3abc" while "jean-pierre" becomes 
 * "Jean-Pierre". Accented letters are treated like any other letter.
 * Case conversion is locale independent.
 * @param p_Original specifies the original string to capitalize words of.
 * @return the processed string with capitalized words, or <code>null</code>
 * if the specified string was <code>null</code>.
 */
//---------------------------------------------------------------------------

public static String	capitalizeWords (String p_Original)
	{
	String			l_Lowered;
	StringBuilder	l_Capitalized;
	boolean			l_InsideWord;
	int				l_Index;
	int				l_CodePoint;

	if (p_Original == null) return null;

	// Locale.ROOT keeps the result independent of the default locale of the JVM.
	l_Lowered     = p_Original.toLowerCase(Locale.ROOT);
	l_Capitalized = new StringBuilder (l_Lowered.length());
	l_InsideWord  = false;
	l_Index       = 0;

	while (l_Index < l_Lowered.length())
		{
		l_CodePoint = l_Lowered.codePointAt(l_Index);

		if (!l_InsideWord && Character.isLetter(l_CodePoint))
			l_Capitalized.appendCodePoint(Character.toUpperCase(l_CodePoint));
		else
			l_Capitalized.appendCodePoint(l_CodePoint);

		// A combining mark belongs to the character it follows, so it must
		// neither start nor end a word; every other character decides anew.
		if (!isCombiningMark(l_CodePoint))
			l_InsideWord = isWordCharacter(l_CodePoint);

		l_Index += Character.charCount(l_CodePoint);
		}

	return l_Capitalized.toString();
	}

//---------------------------------------------------------------------------
/**
 * The deUmlaut method replaces umlauts and accented Latin characters in the
 * specified string by their unaccented ASCII counterparts. Handled are the
 * vowels a, e, i, o and u carrying a circumflex, grave, acute or diaeresis 
 * (lower and upper case), c with cedilla (lower and upper case) and the 
 * sharp s, which is replaced by "ss". All other characters are copied 
 * unchanged.
 * @param p_Original specifies the original string to remove umlauts from.
 * @return the processed string without umlauts, or <code>null</code> if the
 * specified string was <code>null</code>.
 */
//---------------------------------------------------------------------------

public static String deUmlaut (String p_Original)
	{
	StringBuilder	l_Plain;
	String			l_Replacement;
	char			l_Character;
	int				l_Index;

	if (p_Original == null) return null;

	// Sharp s expands to two characters, hence the result may grow slightly.
	l_Plain = new StringBuilder (p_Original.length());

	for (l_Index = 0; l_Index < p_Original.length(); l_Index++)
		{
		l_Character   = p_Original.charAt(l_Index);
		l_Replacement = getUmlautReplacement(l_Character);

		if (l_Replacement != null)
			l_Plain.append(l_Replacement);
		else
			l_Plain.append(l_Character);
		}

	return l_Plain.toString();
	}

//***************************************************************************
//* Private Helpers                                                         *
//***************************************************************************

//---------------------------------------------------------------------------
/**
 * Tells whether the specified code point counts as part of a word, i.e.
 * whether it is a letter, a digit or an underscore.
 */
//---------------------------------------------------------------------------

private static boolean isWordCharacter (int p_CodePoint)
	{
	return Character.isLetterOrDigit(p_CodePoint) || p_CodePoint == '_';
	}

//---------------------------------------------------------------------------
/**
 * Tells whether the specified code point is a combining mark, such as a 
 * combining acute accent following its base letter.
 */
//---------------------------------------------------------------------------

private static boolean isCombiningMark (int p_CodePoint)
	{
	int l_Type = Character.getType(p_CodePoint);

	return l_Type == Character.NON_SPACING_MARK
		|| l_Type == Character.COMBINING_SPACING_MARK
		|| l_Type == Character.ENCLOSING_MARK;
	}

//---------------------------------------------------------------------------
/**
 * Returns the ASCII replacement for the specified accented character, or
 * <code>null</code> if the character has no replacement and has to be kept.
 */
//---------------------------------------------------------------------------

private static String getUmlautReplacement (char p_Character)
	{
	switch (p_Character)
		{
		// LATIN SMALL LETTER A WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00e0': case '\u00e1': case '\u00e2': case '\u00e4': return "a";
		// LATIN SMALL LETTER E WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00e8': case '\u00e9': case '\u00ea': case '\u00eb': return "e";
		// LATIN SMALL LETTER I WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00ec': case '\u00ed': case '\u00ee': case '\u00ef': return "i";
		// LATIN SMALL LETTER O WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00f2': case '\u00f3': case '\u00f4': case '\u00f6': return "o";
		// LATIN SMALL LETTER U WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00f9': case '\u00fa': case '\u00fb': case '\u00fc': return "u";

		// LATIN SMALL LETTER SHARP S
		case '\u00df': return "ss";
		// LATIN SMALL LETTER C WITH CEDILLA
		case '\u00e7': return "c";

		// LATIN CAPITAL LETTER A WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00c0': case '\u00c1': case '\u00c2': case '\u00c4': return "A";
		// LATIN CAPITAL LETTER E WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00c8': case '\u00c9': case '\u00ca': case '\u00cb': return "E";
		// LATIN CAPITAL LETTER I WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00cc': case '\u00cd': case '\u00ce': case '\u00cf': return "I";
		// LATIN CAPITAL LETTER O WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00d2': case '\u00d3': case '\u00d4': case '\u00d6': return "O";
		// LATIN CAPITAL LETTER U WITH GRAVE, ACUTE, CIRCUMFLEX, DIAERESIS
		case '\u00d9': case '\u00da': case '\u00db': case '\u00dc': return "U";

		// LATIN CAPITAL LETTER C WITH CEDILLA
		case '\u00c7': return "C";

		default: return null;
		}
	}

//---------------------------------------------------------------------------
//***************************************************************************
//* End of Class															*
//***************************************************************************
	
}
