QGIS API Documentation  2.12.0-Lyon
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 
19 int QgsStringUtils::levenshteinDistance( const QString& string1, const QString& string2, bool caseSensitive )
20 {
21  int length1 = string1.length();
22  int length2 = string2.length();
23 
24  //empty strings? solution is trivial...
25  if ( string1.isEmpty() )
26  {
27  return length2;
28  }
29  else if ( string2.isEmpty() )
30  {
31  return length1;
32  }
33 
34  //handle case sensitive flag (or not)
35  QString s1( caseSensitive ? string1 : string1.toLower() );
36  QString s2( caseSensitive ? string2 : string2.toLower() );
37 
38  const QChar* s1Char = s1.constData();
39  const QChar* s2Char = s2.constData();
40 
41  //strip out any common prefix
42  int commonPrefixLen = 0;
43  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
44  {
45  commonPrefixLen++;
46  length1--;
47  length2--;
48  s1Char++;
49  s2Char++;
50  }
51 
52  //strip out any common suffix
53  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
54  {
55  length1--;
56  length2--;
57  }
58 
59  //fully checked either string? if so, the answer is easy...
60  if ( length1 == 0 )
61  {
62  return length2;
63  }
64  else if ( length2 == 0 )
65  {
66  return length1;
67  }
68 
69  //ensure the inner loop is longer
70  if ( length1 > length2 )
71  {
72  qSwap( s1, s2 );
73  qSwap( length1, length2 );
74  }
75 
76  //levenshtein algorithm begins here
77  QVector< int > col;
78  col.fill( 0, length2 + 1 );
79  QVector< int > prevCol;
80  prevCol.reserve( length2 + 1 );
81  for ( int i = 0; i < length2 + 1; ++i )
82  {
83  prevCol << i;
84  }
85  const QChar* s2start = s2Char;
86  for ( int i = 0; i < length1; ++i )
87  {
88  col[0] = i + 1;
89  s2Char = s2start;
90  for ( int j = 0; j < length2; ++j )
91  {
92  col[j + 1] = qMin( qMin( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + (( *s1Char == *s2Char ) ? 0 : 1 ) );
93  s2Char++;
94  }
95  col.swap( prevCol );
96  s1Char++;
97  }
98  return prevCol[length2];
99 }
100 
101 QString QgsStringUtils::longestCommonSubstring( const QString& string1, const QString& string2, bool caseSensitive )
102 {
103  if ( string1.isEmpty() || string2.isEmpty() )
104  {
105  //empty strings, solution is trivial...
106  return QString();
107  }
108 
109  //handle case sensitive flag (or not)
110  QString s1( caseSensitive ? string1 : string1.toLower() );
111  QString s2( caseSensitive ? string2 : string2.toLower() );
112 
113  if ( s1 == s2 )
114  {
115  //another trivial case, identical strings
116  return s1;
117  }
118 
119  int* currentScores = new int [ s2.length()];
120  int* previousScores = new int [ s2.length()];
121  int maxCommonLength = 0;
122  int lastMaxBeginIndex = 0;
123 
124  const QChar* s1Char = s1.constData();
125  const QChar* s2Char = s2.constData();
126  const QChar* s2Start = s2Char;
127 
128  for ( int i = 0; i < s1.length(); ++i )
129  {
130  for ( int j = 0; j < s2.length(); ++j )
131  {
132  if ( *s1Char != *s2Char )
133  {
134  currentScores[j] = 0;
135  }
136  else
137  {
138  if ( i == 0 || j == 0 )
139  {
140  currentScores[j] = 1;
141  }
142  else
143  {
144  currentScores[j] = 1 + previousScores[j - 1];
145  }
146 
147  if ( maxCommonLength < currentScores[j] )
148  {
149  maxCommonLength = currentScores[j];
150  lastMaxBeginIndex = i;
151  }
152  }
153  s2Char++;
154  }
155  qSwap( currentScores, previousScores );
156  s1Char++;
157  s2Char = s2Start;
158  }
159  delete [] currentScores;
160  delete [] previousScores;
161  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
162 }
163 
164 int QgsStringUtils::hammingDistance( const QString& string1, const QString& string2, bool caseSensitive )
165 {
166  if ( string1.isEmpty() && string2.isEmpty() )
167  {
168  //empty strings, solution is trivial...
169  return 0;
170  }
171 
172  if ( string1.length() != string2.length() )
173  {
174  //invalid inputs
175  return -1;
176  }
177 
178  //handle case sensitive flag (or not)
179  QString s1( caseSensitive ? string1 : string1.toLower() );
180  QString s2( caseSensitive ? string2 : string2.toLower() );
181 
182  if ( s1 == s2 )
183  {
184  //another trivial case, identical strings
185  return 0;
186  }
187 
188  int distance = 0;
189  const QChar* s1Char = s1.constData();
190  const QChar* s2Char = s2.constData();
191 
192  for ( int i = 0; i < string1.length(); ++i )
193  {
194  if ( *s1Char != *s2Char )
195  distance++;
196  s1Char++;
197  s2Char++;
198  }
199 
200  return distance;
201 }
202 
204 {
205  if ( string.isEmpty() )
206  return QString();
207 
208  QString tmp = string.toUpper();
209 
210  //strip non character codes, and vowel like characters after the first character
211  QChar* char1 = tmp.data();
212  QChar* char2 = tmp.data();
213  int outLen = 0;
214  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
215  {
216  if (( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || (( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
217  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
218  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
219  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
220  {
221  *char1 = *char2;
222  char1++;
223  outLen++;
224  }
225  }
226  tmp.truncate( outLen );
227 
228  QChar* tmpChar = tmp.data();
229  tmpChar++;
230  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
231  {
232  switch (( *tmpChar ).unicode() )
233  {
234  case 0x42:
235  case 0x46:
236  case 0x50:
237  case 0x56:
238  tmp.replace( i, 1, QChar( 0x31 ) );
239  break;
240 
241  case 0x43:
242  case 0x47:
243  case 0x4A:
244  case 0x4B:
245  case 0x51:
246  case 0x53:
247  case 0x58:
248  case 0x5A:
249  tmp.replace( i, 1, QChar( 0x32 ) );
250  break;
251 
252  case 0x44:
253  case 0x54:
254  tmp.replace( i, 1, QChar( 0x33 ) );
255  break;
256 
257  case 0x4C:
258  tmp.replace( i, 1, QChar( 0x34 ) );
259  break;
260 
261  case 0x4D:
262  case 0x4E:
263  tmp.replace( i, 1, QChar( 0x35 ) );
264  break;
265 
266  case 0x52:
267  tmp.replace( i, 1, QChar( 0x36 ) );
268  break;
269  }
270  }
271 
272  //remove adjacent duplicates
273  char1 = tmp.data();
274  char2 = tmp.data();
275  char2++;
276  outLen = 1;
277  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
278  {
279  if ( *char2 != *char1 )
280  {
281  char1++;
282  *char1 = *char2;
283  outLen++;
284  if ( outLen == 4 )
285  break;
286  }
287  }
288  tmp.truncate( outLen );
289  if ( tmp.length() < 4 )
290  {
291  tmp.append( "000" );
292  tmp.truncate( 4 );
293  }
294 
295  return tmp;
296 }
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
QString & append(QChar ch)
QString toUpper() const
void truncate(int position)
QVector< T > & fill(const T &value, int size)
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
bool isEmpty() const
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
QString toLower() const
void reserve(int size)
QString & replace(int position, int n, QChar after)
QString mid(int position, int n) const
int length() const
void swap(QVector< T > &other)
QChar * data()
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.