QGIS API Documentation  3.8.0-Zanzibar (11aff65)
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 #include <QRegExp>
19 #include <QStringList>
20 #include <QTextBoundaryFinder>
21 #include <QRegularExpression>
22 #include <cstdlib> // for std::abs
23 
24 QString QgsStringUtils::capitalize( const QString &string, QgsStringUtils::Capitalization capitalization )
25 {
26  if ( string.isEmpty() )
27  return QString();
28 
29  switch ( capitalization )
30  {
31  case MixedCase:
32  return string;
33 
34  case AllUppercase:
35  return string.toUpper();
36 
37  case AllLowercase:
38  return string.toLower();
39 
41  {
42  QString temp = string;
43 
44  QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
45  QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
46 
47  wordSplitter.setPosition( 0 );
48  bool first = true;
49  while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
50  || wordSplitter.toNextBoundary() >= 0 )
51  {
52  first = false;
53  letterSplitter.setPosition( wordSplitter.position() );
54  letterSplitter.toNextBoundary();
55  QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
56  temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
57  }
58  return temp;
59  }
60 
61  case TitleCase:
62  {
63  // yes, this is MASSIVELY simplifying the problem!!
64 
65  static QStringList smallWords;
66  static QStringList newPhraseSeparators;
67  static QRegularExpression splitWords;
68  if ( smallWords.empty() )
69  {
70  smallWords = QObject::tr( "a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|s|the|to|vs.|vs|via" ).split( '|' );
71  newPhraseSeparators = QObject::tr( ".|:" ).split( '|' );
72  splitWords = QRegularExpression( QStringLiteral( "\\b" ), QRegularExpression::UseUnicodePropertiesOption );
73  }
74 
75  const QStringList parts = string.split( splitWords, QString::SkipEmptyParts );
76  QString result;
77  bool firstWord = true;
78  int i = 0;
79  int lastWord = parts.count() - 1;
80  for ( const QString &word : qgis::as_const( parts ) )
81  {
82  if ( newPhraseSeparators.contains( word.trimmed() ) )
83  {
84  firstWord = true;
85  result += word;
86  }
87  else if ( firstWord || ( i == lastWord ) || !smallWords.contains( word ) )
88  {
89  result += word.at( 0 ).toUpper() + word.mid( 1 );
90  firstWord = false;
91  }
92  else
93  {
94  result += word;
95  }
96  i++;
97  }
98  return result;
99  }
100 
101  case UpperCamelCase:
102  QString result = QgsStringUtils::capitalize( string.toLower(), QgsStringUtils::ForceFirstLetterToCapital ).simplified();
103  result.remove( ' ' );
104  return result;
105  }
106  // no warnings
107  return string;
108 }
109 
110 // original code from http://www.qtcentre.org/threads/52456-HTML-Unicode-ampersand-encoding
111 QString QgsStringUtils::ampersandEncode( const QString &string )
112 {
113  QString encoded;
114  for ( int i = 0; i < string.size(); ++i )
115  {
116  QChar ch = string.at( i );
117  if ( ch.unicode() > 160 )
118  encoded += QStringLiteral( "&#%1;" ).arg( static_cast< int >( ch.unicode() ) );
119  else if ( ch.unicode() == 38 )
120  encoded += QStringLiteral( "&amp;" );
121  else if ( ch.unicode() == 60 )
122  encoded += QStringLiteral( "&lt;" );
123  else if ( ch.unicode() == 62 )
124  encoded += QStringLiteral( "&gt;" );
125  else
126  encoded += ch;
127  }
128  return encoded;
129 }
130 
131 int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
132 {
133  int length1 = string1.length();
134  int length2 = string2.length();
135 
136  //empty strings? solution is trivial...
137  if ( string1.isEmpty() )
138  {
139  return length2;
140  }
141  else if ( string2.isEmpty() )
142  {
143  return length1;
144  }
145 
146  //handle case sensitive flag (or not)
147  QString s1( caseSensitive ? string1 : string1.toLower() );
148  QString s2( caseSensitive ? string2 : string2.toLower() );
149 
150  const QChar *s1Char = s1.constData();
151  const QChar *s2Char = s2.constData();
152 
153  //strip out any common prefix
154  int commonPrefixLen = 0;
155  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
156  {
157  commonPrefixLen++;
158  length1--;
159  length2--;
160  s1Char++;
161  s2Char++;
162  }
163 
164  //strip out any common suffix
165  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
166  {
167  length1--;
168  length2--;
169  }
170 
171  //fully checked either string? if so, the answer is easy...
172  if ( length1 == 0 )
173  {
174  return length2;
175  }
176  else if ( length2 == 0 )
177  {
178  return length1;
179  }
180 
181  //ensure the inner loop is longer
182  if ( length1 > length2 )
183  {
184  std::swap( s1, s2 );
185  std::swap( length1, length2 );
186  }
187 
188  //levenshtein algorithm begins here
189  QVector< int > col;
190  col.fill( 0, length2 + 1 );
191  QVector< int > prevCol;
192  prevCol.reserve( length2 + 1 );
193  for ( int i = 0; i < length2 + 1; ++i )
194  {
195  prevCol << i;
196  }
197  const QChar *s2start = s2Char;
198  for ( int i = 0; i < length1; ++i )
199  {
200  col[0] = i + 1;
201  s2Char = s2start;
202  for ( int j = 0; j < length2; ++j )
203  {
204  col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
205  s2Char++;
206  }
207  col.swap( prevCol );
208  s1Char++;
209  }
210  return prevCol[length2];
211 }
212 
213 QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
214 {
215  if ( string1.isEmpty() || string2.isEmpty() )
216  {
217  //empty strings, solution is trivial...
218  return QString();
219  }
220 
221  //handle case sensitive flag (or not)
222  QString s1( caseSensitive ? string1 : string1.toLower() );
223  QString s2( caseSensitive ? string2 : string2.toLower() );
224 
225  if ( s1 == s2 )
226  {
227  //another trivial case, identical strings
228  return s1;
229  }
230 
231  int *currentScores = new int [ s2.length()];
232  int *previousScores = new int [ s2.length()];
233  int maxCommonLength = 0;
234  int lastMaxBeginIndex = 0;
235 
236  const QChar *s1Char = s1.constData();
237  const QChar *s2Char = s2.constData();
238  const QChar *s2Start = s2Char;
239 
240  for ( int i = 0; i < s1.length(); ++i )
241  {
242  for ( int j = 0; j < s2.length(); ++j )
243  {
244  if ( *s1Char != *s2Char )
245  {
246  currentScores[j] = 0;
247  }
248  else
249  {
250  if ( i == 0 || j == 0 )
251  {
252  currentScores[j] = 1;
253  }
254  else
255  {
256  currentScores[j] = 1 + previousScores[j - 1];
257  }
258 
259  if ( maxCommonLength < currentScores[j] )
260  {
261  maxCommonLength = currentScores[j];
262  lastMaxBeginIndex = i;
263  }
264  }
265  s2Char++;
266  }
267  std::swap( currentScores, previousScores );
268  s1Char++;
269  s2Char = s2Start;
270  }
271  delete [] currentScores;
272  delete [] previousScores;
273  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
274 }
275 
276 int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
277 {
278  if ( string1.isEmpty() && string2.isEmpty() )
279  {
280  //empty strings, solution is trivial...
281  return 0;
282  }
283 
284  if ( string1.length() != string2.length() )
285  {
286  //invalid inputs
287  return -1;
288  }
289 
290  //handle case sensitive flag (or not)
291  QString s1( caseSensitive ? string1 : string1.toLower() );
292  QString s2( caseSensitive ? string2 : string2.toLower() );
293 
294  if ( s1 == s2 )
295  {
296  //another trivial case, identical strings
297  return 0;
298  }
299 
300  int distance = 0;
301  const QChar *s1Char = s1.constData();
302  const QChar *s2Char = s2.constData();
303 
304  for ( int i = 0; i < string1.length(); ++i )
305  {
306  if ( *s1Char != *s2Char )
307  distance++;
308  s1Char++;
309  s2Char++;
310  }
311 
312  return distance;
313 }
314 
315 QString QgsStringUtils::soundex( const QString &string )
316 {
317  if ( string.isEmpty() )
318  return QString();
319 
320  QString tmp = string.toUpper();
321 
322  //strip non character codes, and vowel like characters after the first character
323  QChar *char1 = tmp.data();
324  QChar *char2 = tmp.data();
325  int outLen = 0;
326  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
327  {
328  if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
329  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
330  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
331  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
332  {
333  *char1 = *char2;
334  char1++;
335  outLen++;
336  }
337  }
338  tmp.truncate( outLen );
339 
340  QChar *tmpChar = tmp.data();
341  tmpChar++;
342  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
343  {
344  switch ( ( *tmpChar ).unicode() )
345  {
346  case 0x42:
347  case 0x46:
348  case 0x50:
349  case 0x56:
350  tmp.replace( i, 1, QChar( 0x31 ) );
351  break;
352 
353  case 0x43:
354  case 0x47:
355  case 0x4A:
356  case 0x4B:
357  case 0x51:
358  case 0x53:
359  case 0x58:
360  case 0x5A:
361  tmp.replace( i, 1, QChar( 0x32 ) );
362  break;
363 
364  case 0x44:
365  case 0x54:
366  tmp.replace( i, 1, QChar( 0x33 ) );
367  break;
368 
369  case 0x4C:
370  tmp.replace( i, 1, QChar( 0x34 ) );
371  break;
372 
373  case 0x4D:
374  case 0x4E:
375  tmp.replace( i, 1, QChar( 0x35 ) );
376  break;
377 
378  case 0x52:
379  tmp.replace( i, 1, QChar( 0x36 ) );
380  break;
381  }
382  }
383 
384  //remove adjacent duplicates
385  char1 = tmp.data();
386  char2 = tmp.data();
387  char2++;
388  outLen = 1;
389  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
390  {
391  if ( *char2 != *char1 )
392  {
393  char1++;
394  *char1 = *char2;
395  outLen++;
396  if ( outLen == 4 )
397  break;
398  }
399  }
400  tmp.truncate( outLen );
401  if ( tmp.length() < 4 )
402  {
403  tmp.append( "000" );
404  tmp.truncate( 4 );
405  }
406 
407  return tmp;
408 }
409 
410 QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
411 {
412  QString converted = string;
413 
414  // http://alanstorm.com/url_regex_explained
415  // note - there's more robust implementations available, but we need one which works within the limitation of QRegExp
416  static QRegExp urlRegEx( "(\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/))))" );
417  static QRegExp protoRegEx( "^(?:f|ht)tps?://|file://" );
418  static QRegExp emailRegEx( "([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)" );
419 
420  int offset = 0;
421  bool found = false;
422  while ( urlRegEx.indexIn( converted, offset ) != -1 )
423  {
424  found = true;
425  QString url = urlRegEx.cap( 1 );
426  QString protoUrl = url;
427  if ( protoRegEx.indexIn( protoUrl ) == -1 )
428  {
429  protoUrl.prepend( "http://" );
430  }
431  QString anchor = QStringLiteral( "<a href=\"%1\">%2</a>" ).arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
432  converted.replace( urlRegEx.pos( 1 ), url.length(), anchor );
433  offset = urlRegEx.pos( 1 ) + anchor.length();
434  }
435  offset = 0;
436  while ( emailRegEx.indexIn( converted, offset ) != -1 )
437  {
438  found = true;
439  QString email = emailRegEx.cap( 1 );
440  QString anchor = QStringLiteral( "<a href=\"mailto:%1\">%1</a>" ).arg( email.toHtmlEscaped() );
441  converted.replace( emailRegEx.pos( 1 ), email.length(), anchor );
442  offset = emailRegEx.pos( 1 ) + anchor.length();
443  }
444 
445  if ( foundLinks )
446  *foundLinks = found;
447 
448  return converted;
449 }
450 
451 QString QgsStringUtils::wordWrap( const QString &string, const int length, const bool useMaxLineLength, const QString &customDelimiter )
452 {
453  if ( string.isEmpty() || length == 0 )
454  return string;
455 
456  QString newstr;
457  QRegExp rx;
458  int delimiterLength = 0;
459 
460  if ( !customDelimiter.isEmpty() )
461  {
462  rx.setPatternSyntax( QRegExp::FixedString );
463  rx.setPattern( customDelimiter );
464  delimiterLength = customDelimiter.length();
465  }
466  else
467  {
468  // \x200B is a ZERO-WIDTH SPACE, needed for worwrap to support a number of complex scripts (Indic, Arabic, etc.)
469  rx.setPattern( QStringLiteral( "[\\s\\x200B]" ) );
470  delimiterLength = 1;
471  }
472 
473  const QStringList lines = string.split( '\n' );
474  int strLength, strCurrent, strHit, lastHit;
475 
476  for ( int i = 0; i < lines.size(); i++ )
477  {
478  strLength = lines.at( i ).length();
479  strCurrent = 0;
480  strHit = 0;
481  lastHit = 0;
482 
483  while ( strCurrent < strLength )
484  {
485  // positive wrap value = desired maximum line width to wrap
486  // negative wrap value = desired minimum line width before wrap
487  if ( useMaxLineLength )
488  {
489  //first try to locate delimiter backwards
490  strHit = lines.at( i ).lastIndexOf( rx, strCurrent + length );
491  if ( strHit == lastHit || strHit == -1 )
492  {
493  //if no new backward delimiter found, try to locate forward
494  strHit = lines.at( i ).indexOf( rx, strCurrent + std::abs( length ) );
495  }
496  lastHit = strHit;
497  }
498  else
499  {
500  strHit = lines.at( i ).indexOf( rx, strCurrent + std::abs( length ) );
501  }
502  if ( strHit > -1 )
503  {
504  newstr.append( lines.at( i ).midRef( strCurrent, strHit - strCurrent ) );
505  newstr.append( '\n' );
506  strCurrent = strHit + delimiterLength;
507  }
508  else
509  {
510  newstr.append( lines.at( i ).midRef( strCurrent ) );
511  strCurrent = strLength;
512  }
513  }
514  if ( i < lines.size() - 1 )
515  newstr.append( '\n' );
516  }
517 
518  return newstr;
519 }
520 
521 QgsStringReplacement::QgsStringReplacement( const QString &match, const QString &replacement, bool caseSensitive, bool wholeWordOnly )
522  : mMatch( match )
523  , mReplacement( replacement )
524  , mCaseSensitive( caseSensitive )
525  , mWholeWordOnly( wholeWordOnly )
526 {
527  if ( mWholeWordOnly )
528  mRx = QRegExp( QString( "\\b%1\\b" ).arg( mMatch ),
529  mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
530 }
531 
532 QString QgsStringReplacement::process( const QString &input ) const
533 {
534  QString result = input;
535  if ( !mWholeWordOnly )
536  {
537  return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
538  }
539  else
540  {
541  return result.replace( mRx, mReplacement );
542  }
543 }
544 
546 {
547  QgsStringMap map;
548  map.insert( QStringLiteral( "match" ), mMatch );
549  map.insert( QStringLiteral( "replace" ), mReplacement );
550  map.insert( QStringLiteral( "caseSensitive" ), mCaseSensitive ? "1" : "0" );
551  map.insert( QStringLiteral( "wholeWord" ), mWholeWordOnly ? "1" : "0" );
552  return map;
553 }
554 
556 {
557  return QgsStringReplacement( properties.value( QStringLiteral( "match" ) ),
558  properties.value( QStringLiteral( "replace" ) ),
559  properties.value( QStringLiteral( "caseSensitive" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ),
560  properties.value( QStringLiteral( "wholeWord" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ) );
561 }
562 
563 QString QgsStringReplacementCollection::process( const QString &input ) const
564 {
565  QString result = input;
566  const auto constMReplacements = mReplacements;
567  for ( const QgsStringReplacement &r : constMReplacements )
568  {
569  result = r.process( result );
570  }
571  return result;
572 }
573 
574 void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
575 {
576  const auto constMReplacements = mReplacements;
577  for ( const QgsStringReplacement &r : constMReplacements )
578  {
579  QgsStringMap props = r.properties();
580  QDomElement propEl = doc.createElement( QStringLiteral( "replacement" ) );
581  QgsStringMap::const_iterator it = props.constBegin();
582  for ( ; it != props.constEnd(); ++it )
583  {
584  propEl.setAttribute( it.key(), it.value() );
585  }
586  elem.appendChild( propEl );
587  }
588 }
589 
590 void QgsStringReplacementCollection::readXml( const QDomElement &elem )
591 {
592  mReplacements.clear();
593  QDomNodeList nodelist = elem.elementsByTagName( QStringLiteral( "replacement" ) );
594  for ( int i = 0; i < nodelist.count(); i++ )
595  {
596  QDomElement replacementElem = nodelist.at( i ).toElement();
597  QDomNamedNodeMap nodeMap = replacementElem.attributes();
598 
599  QgsStringMap props;
600  for ( int j = 0; j < nodeMap.count(); ++j )
601  {
602  props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
603  }
604  mReplacements << QgsStringReplacement::fromProperties( props );
605  }
606 
607 }
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
A representation of a single string replacement.
static QString wordWrap(const QString &string, int length, bool useMaxLineLength=true, const QString &customDelimiter=QString())
Automatically wraps a string by inserting new line characters at appropriate locations in the string...
Simple title case conversion - does not fully grammatically parse the text and uses simple rules only...
Convert the string to upper camel case. Note that this method does not unaccent characters.
QMap< QString, QString > QgsStringMap
Definition: qgis.h:587
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
static QString capitalize(const QString &string, Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
Convert just the first letter of each word to uppercase, leave the rest untouched.
Convert all characters to uppercase.
Capitalization
Capitalization options.
QgsStringMap properties() const
Returns a map of the replacement properties.
static QString ampersandEncode(const QString &string)
Makes a raw string safe for inclusion as a HTML/XML string literal.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
Mixed case, ie no change.
Convert all characters to lowercase.
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringR...
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a ...
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.