QGIS API Documentation  3.0.2-Girona (307d082)
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 #include <QRegExp>
19 #include <QStringList>
20 #include <QTextBoundaryFinder>
21 
22 QString QgsStringUtils::capitalize( const QString &string, QgsStringUtils::Capitalization capitalization )
23 {
24  if ( string.isEmpty() )
25  return QString();
26 
27  switch ( capitalization )
28  {
29  case MixedCase:
30  return string;
31 
32  case AllUppercase:
33  return string.toUpper();
34 
35  case AllLowercase:
36  return string.toLower();
37 
39  {
40  QString temp = string;
41 
42  QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
43  QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
44 
45  wordSplitter.setPosition( 0 );
46  bool first = true;
47  while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
48  || wordSplitter.toNextBoundary() >= 0 )
49  {
50  first = false;
51  letterSplitter.setPosition( wordSplitter.position() );
52  letterSplitter.toNextBoundary();
53  QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
54  temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
55  }
56  return temp;
57  }
58 
59  }
60  // no warnings
61  return string;
62 }
63 
64 int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
65 {
66  int length1 = string1.length();
67  int length2 = string2.length();
68 
69  //empty strings? solution is trivial...
70  if ( string1.isEmpty() )
71  {
72  return length2;
73  }
74  else if ( string2.isEmpty() )
75  {
76  return length1;
77  }
78 
79  //handle case sensitive flag (or not)
80  QString s1( caseSensitive ? string1 : string1.toLower() );
81  QString s2( caseSensitive ? string2 : string2.toLower() );
82 
83  const QChar *s1Char = s1.constData();
84  const QChar *s2Char = s2.constData();
85 
86  //strip out any common prefix
87  int commonPrefixLen = 0;
88  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
89  {
90  commonPrefixLen++;
91  length1--;
92  length2--;
93  s1Char++;
94  s2Char++;
95  }
96 
97  //strip out any common suffix
98  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
99  {
100  length1--;
101  length2--;
102  }
103 
104  //fully checked either string? if so, the answer is easy...
105  if ( length1 == 0 )
106  {
107  return length2;
108  }
109  else if ( length2 == 0 )
110  {
111  return length1;
112  }
113 
114  //ensure the inner loop is longer
115  if ( length1 > length2 )
116  {
117  std::swap( s1, s2 );
118  std::swap( length1, length2 );
119  }
120 
121  //levenshtein algorithm begins here
122  QVector< int > col;
123  col.fill( 0, length2 + 1 );
124  QVector< int > prevCol;
125  prevCol.reserve( length2 + 1 );
126  for ( int i = 0; i < length2 + 1; ++i )
127  {
128  prevCol << i;
129  }
130  const QChar *s2start = s2Char;
131  for ( int i = 0; i < length1; ++i )
132  {
133  col[0] = i + 1;
134  s2Char = s2start;
135  for ( int j = 0; j < length2; ++j )
136  {
137  col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
138  s2Char++;
139  }
140  col.swap( prevCol );
141  s1Char++;
142  }
143  return prevCol[length2];
144 }
145 
146 QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
147 {
148  if ( string1.isEmpty() || string2.isEmpty() )
149  {
150  //empty strings, solution is trivial...
151  return QString();
152  }
153 
154  //handle case sensitive flag (or not)
155  QString s1( caseSensitive ? string1 : string1.toLower() );
156  QString s2( caseSensitive ? string2 : string2.toLower() );
157 
158  if ( s1 == s2 )
159  {
160  //another trivial case, identical strings
161  return s1;
162  }
163 
164  int *currentScores = new int [ s2.length()];
165  int *previousScores = new int [ s2.length()];
166  int maxCommonLength = 0;
167  int lastMaxBeginIndex = 0;
168 
169  const QChar *s1Char = s1.constData();
170  const QChar *s2Char = s2.constData();
171  const QChar *s2Start = s2Char;
172 
173  for ( int i = 0; i < s1.length(); ++i )
174  {
175  for ( int j = 0; j < s2.length(); ++j )
176  {
177  if ( *s1Char != *s2Char )
178  {
179  currentScores[j] = 0;
180  }
181  else
182  {
183  if ( i == 0 || j == 0 )
184  {
185  currentScores[j] = 1;
186  }
187  else
188  {
189  currentScores[j] = 1 + previousScores[j - 1];
190  }
191 
192  if ( maxCommonLength < currentScores[j] )
193  {
194  maxCommonLength = currentScores[j];
195  lastMaxBeginIndex = i;
196  }
197  }
198  s2Char++;
199  }
200  std::swap( currentScores, previousScores );
201  s1Char++;
202  s2Char = s2Start;
203  }
204  delete [] currentScores;
205  delete [] previousScores;
206  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
207 }
208 
209 int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
210 {
211  if ( string1.isEmpty() && string2.isEmpty() )
212  {
213  //empty strings, solution is trivial...
214  return 0;
215  }
216 
217  if ( string1.length() != string2.length() )
218  {
219  //invalid inputs
220  return -1;
221  }
222 
223  //handle case sensitive flag (or not)
224  QString s1( caseSensitive ? string1 : string1.toLower() );
225  QString s2( caseSensitive ? string2 : string2.toLower() );
226 
227  if ( s1 == s2 )
228  {
229  //another trivial case, identical strings
230  return 0;
231  }
232 
233  int distance = 0;
234  const QChar *s1Char = s1.constData();
235  const QChar *s2Char = s2.constData();
236 
237  for ( int i = 0; i < string1.length(); ++i )
238  {
239  if ( *s1Char != *s2Char )
240  distance++;
241  s1Char++;
242  s2Char++;
243  }
244 
245  return distance;
246 }
247 
248 QString QgsStringUtils::soundex( const QString &string )
249 {
250  if ( string.isEmpty() )
251  return QString();
252 
253  QString tmp = string.toUpper();
254 
255  //strip non character codes, and vowel like characters after the first character
256  QChar *char1 = tmp.data();
257  QChar *char2 = tmp.data();
258  int outLen = 0;
259  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
260  {
261  if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
262  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
263  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
264  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
265  {
266  *char1 = *char2;
267  char1++;
268  outLen++;
269  }
270  }
271  tmp.truncate( outLen );
272 
273  QChar *tmpChar = tmp.data();
274  tmpChar++;
275  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
276  {
277  switch ( ( *tmpChar ).unicode() )
278  {
279  case 0x42:
280  case 0x46:
281  case 0x50:
282  case 0x56:
283  tmp.replace( i, 1, QChar( 0x31 ) );
284  break;
285 
286  case 0x43:
287  case 0x47:
288  case 0x4A:
289  case 0x4B:
290  case 0x51:
291  case 0x53:
292  case 0x58:
293  case 0x5A:
294  tmp.replace( i, 1, QChar( 0x32 ) );
295  break;
296 
297  case 0x44:
298  case 0x54:
299  tmp.replace( i, 1, QChar( 0x33 ) );
300  break;
301 
302  case 0x4C:
303  tmp.replace( i, 1, QChar( 0x34 ) );
304  break;
305 
306  case 0x4D:
307  case 0x4E:
308  tmp.replace( i, 1, QChar( 0x35 ) );
309  break;
310 
311  case 0x52:
312  tmp.replace( i, 1, QChar( 0x36 ) );
313  break;
314  }
315  }
316 
317  //remove adjacent duplicates
318  char1 = tmp.data();
319  char2 = tmp.data();
320  char2++;
321  outLen = 1;
322  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
323  {
324  if ( *char2 != *char1 )
325  {
326  char1++;
327  *char1 = *char2;
328  outLen++;
329  if ( outLen == 4 )
330  break;
331  }
332  }
333  tmp.truncate( outLen );
334  if ( tmp.length() < 4 )
335  {
336  tmp.append( "000" );
337  tmp.truncate( 4 );
338  }
339 
340  return tmp;
341 }
342 
343 QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
344 {
345  QString converted = string;
346 
347  // http://alanstorm.com/url_regex_explained
348  // note - there's more robust implementations available, but we need one which works within the limitation of QRegExp
349  static QRegExp urlRegEx( "(\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/))))" );
350  static QRegExp protoRegEx( "^(?:f|ht)tps?://" );
351  static QRegExp emailRegEx( "([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)" );
352 
353  int offset = 0;
354  bool found = false;
355  while ( urlRegEx.indexIn( converted, offset ) != -1 )
356  {
357  found = true;
358  QString url = urlRegEx.cap( 1 );
359  QString protoUrl = url;
360  if ( protoRegEx.indexIn( protoUrl ) == -1 )
361  {
362  protoUrl.prepend( "http://" );
363  }
364  QString anchor = QStringLiteral( "<a href=\"%1\">%2</a>" ).arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
365  converted.replace( urlRegEx.pos( 1 ), url.length(), anchor );
366  offset = urlRegEx.pos( 1 ) + anchor.length();
367  }
368  offset = 0;
369  while ( emailRegEx.indexIn( converted, offset ) != -1 )
370  {
371  found = true;
372  QString email = emailRegEx.cap( 1 );
373  QString anchor = QStringLiteral( "<a href=\"mailto:%1\">%1</a>" ).arg( email.toHtmlEscaped(), email.toHtmlEscaped() );
374  converted.replace( emailRegEx.pos( 1 ), email.length(), anchor );
375  offset = emailRegEx.pos( 1 ) + anchor.length();
376  }
377 
378  if ( foundLinks )
379  *foundLinks = found;
380 
381  return converted;
382 }
383 
384 QgsStringReplacement::QgsStringReplacement( const QString &match, const QString &replacement, bool caseSensitive, bool wholeWordOnly )
385  : mMatch( match )
386  , mReplacement( replacement )
387  , mCaseSensitive( caseSensitive )
388  , mWholeWordOnly( wholeWordOnly )
389 {
390  if ( mWholeWordOnly )
391  mRx = QRegExp( QString( "\\b%1\\b" ).arg( mMatch ),
392  mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
393 }
394 
395 QString QgsStringReplacement::process( const QString &input ) const
396 {
397  QString result = input;
398  if ( !mWholeWordOnly )
399  {
400  return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
401  }
402  else
403  {
404  return result.replace( mRx, mReplacement );
405  }
406 }
407 
409 {
410  QgsStringMap map;
411  map.insert( QStringLiteral( "match" ), mMatch );
412  map.insert( QStringLiteral( "replace" ), mReplacement );
413  map.insert( QStringLiteral( "caseSensitive" ), mCaseSensitive ? "1" : "0" );
414  map.insert( QStringLiteral( "wholeWord" ), mWholeWordOnly ? "1" : "0" );
415  return map;
416 }
417 
419 {
420  return QgsStringReplacement( properties.value( QStringLiteral( "match" ) ),
421  properties.value( QStringLiteral( "replace" ) ),
422  properties.value( QStringLiteral( "caseSensitive" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ),
423  properties.value( QStringLiteral( "wholeWord" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ) );
424 }
425 
426 QString QgsStringReplacementCollection::process( const QString &input ) const
427 {
428  QString result = input;
429  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
430  {
431  result = r.process( result );
432  }
433  return result;
434 }
435 
436 void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
437 {
438  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
439  {
440  QgsStringMap props = r.properties();
441  QDomElement propEl = doc.createElement( QStringLiteral( "replacement" ) );
442  QgsStringMap::const_iterator it = props.constBegin();
443  for ( ; it != props.constEnd(); ++it )
444  {
445  propEl.setAttribute( it.key(), it.value() );
446  }
447  elem.appendChild( propEl );
448  }
449 }
450 
451 void QgsStringReplacementCollection::readXml( const QDomElement &elem )
452 {
453  mReplacements.clear();
454  QDomNodeList nodelist = elem.elementsByTagName( QStringLiteral( "replacement" ) );
455  for ( int i = 0; i < nodelist.count(); i++ )
456  {
457  QDomElement replacementElem = nodelist.at( i ).toElement();
458  QDomNamedNodeMap nodeMap = replacementElem.attributes();
459 
460  QgsStringMap props;
461  for ( int j = 0; j < nodeMap.count(); ++j )
462  {
463  props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
464  }
465  mReplacements << QgsStringReplacement::fromProperties( props );
466  }
467 
468 }
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
A representation of a single string replacement.
QMap< QString, QString > QgsStringMap
Definition: qgis.h:479
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
static QString capitalize(const QString &string, Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
Convert just the first letter of each word to uppercase, leave the rest untouched.
Convert all characters to uppercase.
Capitalization
Capitalization options.
QgsStringMap properties() const
Returns a map of the replacement properties.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
Mixed case, ie no change.
Convert all characters to lowercase.
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using the QgsStringReplacement objects in this collection.
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a> links.
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.