QGIS API Documentation  3.6.0-Noosa (5873452)
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 #include <QRegExp>
19 #include <QStringList>
20 #include <QTextBoundaryFinder>
21 #include <QRegularExpression>
22 
23 QString QgsStringUtils::capitalize( const QString &string, QgsStringUtils::Capitalization capitalization )
24 {
25  if ( string.isEmpty() )
26  return QString();
27 
28  switch ( capitalization )
29  {
30  case MixedCase:
31  return string;
32 
33  case AllUppercase:
34  return string.toUpper();
35 
36  case AllLowercase:
37  return string.toLower();
38 
40  {
41  QString temp = string;
42 
43  QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
44  QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
45 
46  wordSplitter.setPosition( 0 );
47  bool first = true;
48  while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
49  || wordSplitter.toNextBoundary() >= 0 )
50  {
51  first = false;
52  letterSplitter.setPosition( wordSplitter.position() );
53  letterSplitter.toNextBoundary();
54  QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
55  temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
56  }
57  return temp;
58  }
59 
60  case TitleCase:
61  {
62  // yes, this is MASSIVELY simplifying the problem!!
63 
64  static QStringList smallWords;
65  static QStringList newPhraseSeparators;
66  static QRegularExpression splitWords;
67  if ( smallWords.empty() )
68  {
69  smallWords = QObject::tr( "a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|s|the|to|vs.|vs|via" ).split( '|' );
70  newPhraseSeparators = QObject::tr( ".|:" ).split( '|' );
71  splitWords = QRegularExpression( QStringLiteral( "\\b" ), QRegularExpression::UseUnicodePropertiesOption );
72  }
73 
74  const QStringList parts = string.split( splitWords, QString::SkipEmptyParts );
75  QString result;
76  bool firstWord = true;
77  int i = 0;
78  int lastWord = parts.count() - 1;
79  for ( const QString &word : qgis::as_const( parts ) )
80  {
81  if ( newPhraseSeparators.contains( word.trimmed() ) )
82  {
83  firstWord = true;
84  result += word;
85  }
86  else if ( firstWord || ( i == lastWord ) || !smallWords.contains( word ) )
87  {
88  result += word.at( 0 ).toUpper() + word.mid( 1 );
89  firstWord = false;
90  }
91  else
92  {
93  result += word;
94  }
95  i++;
96  }
97  return result;
98  }
99 
100  case UpperCamelCase:
101  QString result = QgsStringUtils::capitalize( string.toLower(), QgsStringUtils::ForceFirstLetterToCapital ).simplified();
102  result.remove( ' ' );
103  return result;
104  }
105  // no warnings
106  return string;
107 }
108 
109 // original code from http://www.qtcentre.org/threads/52456-HTML-Unicode-ampersand-encoding
110 QString QgsStringUtils::ampersandEncode( const QString &string )
111 {
112  QString encoded;
113  for ( int i = 0; i < string.size(); ++i )
114  {
115  QChar ch = string.at( i );
116  if ( ch.unicode() > 160 )
117  encoded += QStringLiteral( "&#%1;" ).arg( static_cast< int >( ch.unicode() ) );
118  else if ( ch.unicode() == 38 )
119  encoded += QStringLiteral( "&amp;" );
120  else if ( ch.unicode() == 60 )
121  encoded += QStringLiteral( "&lt;" );
122  else if ( ch.unicode() == 62 )
123  encoded += QStringLiteral( "&gt;" );
124  else
125  encoded += ch;
126  }
127  return encoded;
128 }
129 
130 int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
131 {
132  int length1 = string1.length();
133  int length2 = string2.length();
134 
135  //empty strings? solution is trivial...
136  if ( string1.isEmpty() )
137  {
138  return length2;
139  }
140  else if ( string2.isEmpty() )
141  {
142  return length1;
143  }
144 
145  //handle case sensitive flag (or not)
146  QString s1( caseSensitive ? string1 : string1.toLower() );
147  QString s2( caseSensitive ? string2 : string2.toLower() );
148 
149  const QChar *s1Char = s1.constData();
150  const QChar *s2Char = s2.constData();
151 
152  //strip out any common prefix
153  int commonPrefixLen = 0;
154  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
155  {
156  commonPrefixLen++;
157  length1--;
158  length2--;
159  s1Char++;
160  s2Char++;
161  }
162 
163  //strip out any common suffix
164  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
165  {
166  length1--;
167  length2--;
168  }
169 
170  //fully checked either string? if so, the answer is easy...
171  if ( length1 == 0 )
172  {
173  return length2;
174  }
175  else if ( length2 == 0 )
176  {
177  return length1;
178  }
179 
180  //ensure the inner loop is longer
181  if ( length1 > length2 )
182  {
183  std::swap( s1, s2 );
184  std::swap( length1, length2 );
185  }
186 
187  //levenshtein algorithm begins here
188  QVector< int > col;
189  col.fill( 0, length2 + 1 );
190  QVector< int > prevCol;
191  prevCol.reserve( length2 + 1 );
192  for ( int i = 0; i < length2 + 1; ++i )
193  {
194  prevCol << i;
195  }
196  const QChar *s2start = s2Char;
197  for ( int i = 0; i < length1; ++i )
198  {
199  col[0] = i + 1;
200  s2Char = s2start;
201  for ( int j = 0; j < length2; ++j )
202  {
203  col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
204  s2Char++;
205  }
206  col.swap( prevCol );
207  s1Char++;
208  }
209  return prevCol[length2];
210 }
211 
212 QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
213 {
214  if ( string1.isEmpty() || string2.isEmpty() )
215  {
216  //empty strings, solution is trivial...
217  return QString();
218  }
219 
220  //handle case sensitive flag (or not)
221  QString s1( caseSensitive ? string1 : string1.toLower() );
222  QString s2( caseSensitive ? string2 : string2.toLower() );
223 
224  if ( s1 == s2 )
225  {
226  //another trivial case, identical strings
227  return s1;
228  }
229 
230  int *currentScores = new int [ s2.length()];
231  int *previousScores = new int [ s2.length()];
232  int maxCommonLength = 0;
233  int lastMaxBeginIndex = 0;
234 
235  const QChar *s1Char = s1.constData();
236  const QChar *s2Char = s2.constData();
237  const QChar *s2Start = s2Char;
238 
239  for ( int i = 0; i < s1.length(); ++i )
240  {
241  for ( int j = 0; j < s2.length(); ++j )
242  {
243  if ( *s1Char != *s2Char )
244  {
245  currentScores[j] = 0;
246  }
247  else
248  {
249  if ( i == 0 || j == 0 )
250  {
251  currentScores[j] = 1;
252  }
253  else
254  {
255  currentScores[j] = 1 + previousScores[j - 1];
256  }
257 
258  if ( maxCommonLength < currentScores[j] )
259  {
260  maxCommonLength = currentScores[j];
261  lastMaxBeginIndex = i;
262  }
263  }
264  s2Char++;
265  }
266  std::swap( currentScores, previousScores );
267  s1Char++;
268  s2Char = s2Start;
269  }
270  delete [] currentScores;
271  delete [] previousScores;
272  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
273 }
274 
275 int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
276 {
277  if ( string1.isEmpty() && string2.isEmpty() )
278  {
279  //empty strings, solution is trivial...
280  return 0;
281  }
282 
283  if ( string1.length() != string2.length() )
284  {
285  //invalid inputs
286  return -1;
287  }
288 
289  //handle case sensitive flag (or not)
290  QString s1( caseSensitive ? string1 : string1.toLower() );
291  QString s2( caseSensitive ? string2 : string2.toLower() );
292 
293  if ( s1 == s2 )
294  {
295  //another trivial case, identical strings
296  return 0;
297  }
298 
299  int distance = 0;
300  const QChar *s1Char = s1.constData();
301  const QChar *s2Char = s2.constData();
302 
303  for ( int i = 0; i < string1.length(); ++i )
304  {
305  if ( *s1Char != *s2Char )
306  distance++;
307  s1Char++;
308  s2Char++;
309  }
310 
311  return distance;
312 }
313 
314 QString QgsStringUtils::soundex( const QString &string )
315 {
316  if ( string.isEmpty() )
317  return QString();
318 
319  QString tmp = string.toUpper();
320 
321  //strip non character codes, and vowel like characters after the first character
322  QChar *char1 = tmp.data();
323  QChar *char2 = tmp.data();
324  int outLen = 0;
325  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
326  {
327  if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
328  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
329  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
330  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
331  {
332  *char1 = *char2;
333  char1++;
334  outLen++;
335  }
336  }
337  tmp.truncate( outLen );
338 
339  QChar *tmpChar = tmp.data();
340  tmpChar++;
341  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
342  {
343  switch ( ( *tmpChar ).unicode() )
344  {
345  case 0x42:
346  case 0x46:
347  case 0x50:
348  case 0x56:
349  tmp.replace( i, 1, QChar( 0x31 ) );
350  break;
351 
352  case 0x43:
353  case 0x47:
354  case 0x4A:
355  case 0x4B:
356  case 0x51:
357  case 0x53:
358  case 0x58:
359  case 0x5A:
360  tmp.replace( i, 1, QChar( 0x32 ) );
361  break;
362 
363  case 0x44:
364  case 0x54:
365  tmp.replace( i, 1, QChar( 0x33 ) );
366  break;
367 
368  case 0x4C:
369  tmp.replace( i, 1, QChar( 0x34 ) );
370  break;
371 
372  case 0x4D:
373  case 0x4E:
374  tmp.replace( i, 1, QChar( 0x35 ) );
375  break;
376 
377  case 0x52:
378  tmp.replace( i, 1, QChar( 0x36 ) );
379  break;
380  }
381  }
382 
383  //remove adjacent duplicates
384  char1 = tmp.data();
385  char2 = tmp.data();
386  char2++;
387  outLen = 1;
388  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
389  {
390  if ( *char2 != *char1 )
391  {
392  char1++;
393  *char1 = *char2;
394  outLen++;
395  if ( outLen == 4 )
396  break;
397  }
398  }
399  tmp.truncate( outLen );
400  if ( tmp.length() < 4 )
401  {
402  tmp.append( "000" );
403  tmp.truncate( 4 );
404  }
405 
406  return tmp;
407 }
408 
409 QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
410 {
411  QString converted = string;
412 
413  // http://alanstorm.com/url_regex_explained
414  // note - there's more robust implementations available, but we need one which works within the limitation of QRegExp
415  static QRegExp urlRegEx( "(\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/))))" );
416  static QRegExp protoRegEx( "^(?:f|ht)tps?://" );
417  static QRegExp emailRegEx( "([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)" );
418 
419  int offset = 0;
420  bool found = false;
421  while ( urlRegEx.indexIn( converted, offset ) != -1 )
422  {
423  found = true;
424  QString url = urlRegEx.cap( 1 );
425  QString protoUrl = url;
426  if ( protoRegEx.indexIn( protoUrl ) == -1 )
427  {
428  protoUrl.prepend( "http://" );
429  }
430  QString anchor = QStringLiteral( "<a href=\"%1\">%2</a>" ).arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
431  converted.replace( urlRegEx.pos( 1 ), url.length(), anchor );
432  offset = urlRegEx.pos( 1 ) + anchor.length();
433  }
434  offset = 0;
435  while ( emailRegEx.indexIn( converted, offset ) != -1 )
436  {
437  found = true;
438  QString email = emailRegEx.cap( 1 );
439  QString anchor = QStringLiteral( "<a href=\"mailto:%1\">%1</a>" ).arg( email.toHtmlEscaped() );
440  converted.replace( emailRegEx.pos( 1 ), email.length(), anchor );
441  offset = emailRegEx.pos( 1 ) + anchor.length();
442  }
443 
444  if ( foundLinks )
445  *foundLinks = found;
446 
447  return converted;
448 }
449 
450 QString QgsStringUtils::wordWrap( const QString &string, const int length, const bool useMaxLineLength, const QString &customDelimiter )
451 {
452  if ( string.isEmpty() || length == 0 )
453  return string;
454 
455  QString newstr;
456  QRegExp rx;
457  int delimiterLength = 0;
458 
459  if ( !customDelimiter.isEmpty() )
460  {
461  rx.setPatternSyntax( QRegExp::FixedString );
462  rx.setPattern( customDelimiter );
463  delimiterLength = customDelimiter.length();
464  }
465  else
466  {
467  // \x200B is a ZERO-WIDTH SPACE, needed for worwrap to support a number of complex scripts (Indic, Arabic, etc.)
468  rx.setPattern( QStringLiteral( "[\\s\\x200B]" ) );
469  delimiterLength = 1;
470  }
471 
472  const QStringList lines = string.split( '\n' );
473  int strLength, strCurrent, strHit, lastHit;
474 
475  for ( int i = 0; i < lines.size(); i++ )
476  {
477  strLength = lines.at( i ).length();
478  strCurrent = 0;
479  strHit = 0;
480  lastHit = 0;
481 
482  while ( strCurrent < strLength )
483  {
484  // positive wrap value = desired maximum line width to wrap
485  // negative wrap value = desired minimum line width before wrap
486  if ( useMaxLineLength )
487  {
488  //first try to locate delimiter backwards
489  strHit = lines.at( i ).lastIndexOf( rx, strCurrent + length );
490  if ( strHit == lastHit || strHit == -1 )
491  {
492  //if no new backward delimiter found, try to locate forward
493  strHit = lines.at( i ).indexOf( rx, strCurrent + std::abs( length ) );
494  }
495  lastHit = strHit;
496  }
497  else
498  {
499  strHit = lines.at( i ).indexOf( rx, strCurrent + std::abs( length ) );
500  }
501  if ( strHit > -1 )
502  {
503  newstr.append( lines.at( i ).midRef( strCurrent, strHit - strCurrent ) );
504  newstr.append( '\n' );
505  strCurrent = strHit + delimiterLength;
506  }
507  else
508  {
509  newstr.append( lines.at( i ).midRef( strCurrent ) );
510  strCurrent = strLength;
511  }
512  }
513  if ( i < lines.size() - 1 )
514  newstr.append( '\n' );
515  }
516 
517  return newstr;
518 }
519 
520 QgsStringReplacement::QgsStringReplacement( const QString &match, const QString &replacement, bool caseSensitive, bool wholeWordOnly )
521  : mMatch( match )
522  , mReplacement( replacement )
523  , mCaseSensitive( caseSensitive )
524  , mWholeWordOnly( wholeWordOnly )
525 {
526  if ( mWholeWordOnly )
527  mRx = QRegExp( QString( "\\b%1\\b" ).arg( mMatch ),
528  mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
529 }
530 
531 QString QgsStringReplacement::process( const QString &input ) const
532 {
533  QString result = input;
534  if ( !mWholeWordOnly )
535  {
536  return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
537  }
538  else
539  {
540  return result.replace( mRx, mReplacement );
541  }
542 }
543 
545 {
546  QgsStringMap map;
547  map.insert( QStringLiteral( "match" ), mMatch );
548  map.insert( QStringLiteral( "replace" ), mReplacement );
549  map.insert( QStringLiteral( "caseSensitive" ), mCaseSensitive ? "1" : "0" );
550  map.insert( QStringLiteral( "wholeWord" ), mWholeWordOnly ? "1" : "0" );
551  return map;
552 }
553 
555 {
556  return QgsStringReplacement( properties.value( QStringLiteral( "match" ) ),
557  properties.value( QStringLiteral( "replace" ) ),
558  properties.value( QStringLiteral( "caseSensitive" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ),
559  properties.value( QStringLiteral( "wholeWord" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ) );
560 }
561 
562 QString QgsStringReplacementCollection::process( const QString &input ) const
563 {
564  QString result = input;
565  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
566  {
567  result = r.process( result );
568  }
569  return result;
570 }
571 
572 void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
573 {
574  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
575  {
576  QgsStringMap props = r.properties();
577  QDomElement propEl = doc.createElement( QStringLiteral( "replacement" ) );
578  QgsStringMap::const_iterator it = props.constBegin();
579  for ( ; it != props.constEnd(); ++it )
580  {
581  propEl.setAttribute( it.key(), it.value() );
582  }
583  elem.appendChild( propEl );
584  }
585 }
586 
587 void QgsStringReplacementCollection::readXml( const QDomElement &elem )
588 {
589  mReplacements.clear();
590  QDomNodeList nodelist = elem.elementsByTagName( QStringLiteral( "replacement" ) );
591  for ( int i = 0; i < nodelist.count(); i++ )
592  {
593  QDomElement replacementElem = nodelist.at( i ).toElement();
594  QDomNamedNodeMap nodeMap = replacementElem.attributes();
595 
596  QgsStringMap props;
597  for ( int j = 0; j < nodeMap.count(); ++j )
598  {
599  props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
600  }
601  mReplacements << QgsStringReplacement::fromProperties( props );
602  }
603 
604 }
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
A representation of a single string replacement.
static QString wordWrap(const QString &string, int length, bool useMaxLineLength=true, const QString &customDelimiter=QString())
Automatically wraps a string by inserting new line characters at appropriate locations in the string...
Simple title case conversion - does not fully grammatically parse the text and uses simple rules only...
Convert the string to upper camel case. Note that this method does not unaccent characters.
QMap< QString, QString > QgsStringMap
Definition: qgis.h:587
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
static QString capitalize(const QString &string, Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
Convert just the first letter of each word to uppercase, leave the rest untouched.
Convert all characters to uppercase.
Capitalization
Capitalization options.
QgsStringMap properties() const
Returns a map of the replacement properties.
static QString ampersandEncode(const QString &string)
Makes a raw string safe for inclusion as a HTML/XML string literal.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
Mixed case, ie no change.
Convert all characters to lowercase.
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringR...
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a ...
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.