QGIS API Documentation  3.4.15-Madeira (e83d02e274)
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 #include <QRegExp>
19 #include <QStringList>
20 #include <QTextBoundaryFinder>
21 #include <QRegularExpression>
22 #include <cstdlib> // for std::abs
23 
24 QString QgsStringUtils::capitalize( const QString &string, QgsStringUtils::Capitalization capitalization )
25 {
26  if ( string.isEmpty() )
27  return QString();
28 
29  switch ( capitalization )
30  {
31  case MixedCase:
32  return string;
33 
34  case AllUppercase:
35  return string.toUpper();
36 
37  case AllLowercase:
38  return string.toLower();
39 
41  {
42  QString temp = string;
43 
44  QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
45  QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
46 
47  wordSplitter.setPosition( 0 );
48  bool first = true;
49  while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
50  || wordSplitter.toNextBoundary() >= 0 )
51  {
52  first = false;
53  letterSplitter.setPosition( wordSplitter.position() );
54  letterSplitter.toNextBoundary();
55  QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
56  temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
57  }
58  return temp;
59  }
60 
61  case TitleCase:
62  {
63  // yes, this is MASSIVELY simplifying the problem!!
64 
65  static QStringList smallWords;
66  static QStringList newPhraseSeparators;
67  static QRegularExpression splitWords;
68  if ( smallWords.empty() )
69  {
70  smallWords = QObject::tr( "a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|s|the|to|vs.|vs|via" ).split( '|' );
71  newPhraseSeparators = QObject::tr( ".|:" ).split( '|' );
72  splitWords = QRegularExpression( QStringLiteral( "\\b" ), QRegularExpression::UseUnicodePropertiesOption );
73  }
74 
75  const QStringList parts = string.split( splitWords, QString::SkipEmptyParts );
76  QString result;
77  bool firstWord = true;
78  int i = 0;
79  int lastWord = parts.count() - 1;
80  for ( const QString &word : qgis::as_const( parts ) )
81  {
82  if ( newPhraseSeparators.contains( word.trimmed() ) )
83  {
84  firstWord = true;
85  result += word;
86  }
87  else if ( firstWord || ( i == lastWord ) || !smallWords.contains( word ) )
88  {
89  result += word.at( 0 ).toUpper() + word.mid( 1 );
90  firstWord = false;
91  }
92  else
93  {
94  result += word;
95  }
96  i++;
97  }
98  return result;
99  }
100  }
101  // no warnings
102  return string;
103 }
104 
105 // original code from http://www.qtcentre.org/threads/52456-HTML-Unicode-ampersand-encoding
106 QString QgsStringUtils::ampersandEncode( const QString &string )
107 {
108  QString encoded;
109  for ( int i = 0; i < string.size(); ++i )
110  {
111  QChar ch = string.at( i );
112  if ( ch.unicode() > 160 )
113  encoded += QStringLiteral( "&#%1;" ).arg( static_cast< int >( ch.unicode() ) );
114  else if ( ch.unicode() == 38 )
115  encoded += QStringLiteral( "&amp;" );
116  else if ( ch.unicode() == 60 )
117  encoded += QStringLiteral( "&lt;" );
118  else if ( ch.unicode() == 62 )
119  encoded += QStringLiteral( "&gt;" );
120  else
121  encoded += ch;
122  }
123  return encoded;
124 }
125 
126 int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
127 {
128  int length1 = string1.length();
129  int length2 = string2.length();
130 
131  //empty strings? solution is trivial...
132  if ( string1.isEmpty() )
133  {
134  return length2;
135  }
136  else if ( string2.isEmpty() )
137  {
138  return length1;
139  }
140 
141  //handle case sensitive flag (or not)
142  QString s1( caseSensitive ? string1 : string1.toLower() );
143  QString s2( caseSensitive ? string2 : string2.toLower() );
144 
145  const QChar *s1Char = s1.constData();
146  const QChar *s2Char = s2.constData();
147 
148  //strip out any common prefix
149  int commonPrefixLen = 0;
150  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
151  {
152  commonPrefixLen++;
153  length1--;
154  length2--;
155  s1Char++;
156  s2Char++;
157  }
158 
159  //strip out any common suffix
160  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
161  {
162  length1--;
163  length2--;
164  }
165 
166  //fully checked either string? if so, the answer is easy...
167  if ( length1 == 0 )
168  {
169  return length2;
170  }
171  else if ( length2 == 0 )
172  {
173  return length1;
174  }
175 
176  //ensure the inner loop is longer
177  if ( length1 > length2 )
178  {
179  std::swap( s1, s2 );
180  std::swap( length1, length2 );
181  }
182 
183  //levenshtein algorithm begins here
184  QVector< int > col;
185  col.fill( 0, length2 + 1 );
186  QVector< int > prevCol;
187  prevCol.reserve( length2 + 1 );
188  for ( int i = 0; i < length2 + 1; ++i )
189  {
190  prevCol << i;
191  }
192  const QChar *s2start = s2Char;
193  for ( int i = 0; i < length1; ++i )
194  {
195  col[0] = i + 1;
196  s2Char = s2start;
197  for ( int j = 0; j < length2; ++j )
198  {
199  col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
200  s2Char++;
201  }
202  col.swap( prevCol );
203  s1Char++;
204  }
205  return prevCol[length2];
206 }
207 
208 QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
209 {
210  if ( string1.isEmpty() || string2.isEmpty() )
211  {
212  //empty strings, solution is trivial...
213  return QString();
214  }
215 
216  //handle case sensitive flag (or not)
217  QString s1( caseSensitive ? string1 : string1.toLower() );
218  QString s2( caseSensitive ? string2 : string2.toLower() );
219 
220  if ( s1 == s2 )
221  {
222  //another trivial case, identical strings
223  return s1;
224  }
225 
226  int *currentScores = new int [ s2.length()];
227  int *previousScores = new int [ s2.length()];
228  int maxCommonLength = 0;
229  int lastMaxBeginIndex = 0;
230 
231  const QChar *s1Char = s1.constData();
232  const QChar *s2Char = s2.constData();
233  const QChar *s2Start = s2Char;
234 
235  for ( int i = 0; i < s1.length(); ++i )
236  {
237  for ( int j = 0; j < s2.length(); ++j )
238  {
239  if ( *s1Char != *s2Char )
240  {
241  currentScores[j] = 0;
242  }
243  else
244  {
245  if ( i == 0 || j == 0 )
246  {
247  currentScores[j] = 1;
248  }
249  else
250  {
251  currentScores[j] = 1 + previousScores[j - 1];
252  }
253 
254  if ( maxCommonLength < currentScores[j] )
255  {
256  maxCommonLength = currentScores[j];
257  lastMaxBeginIndex = i;
258  }
259  }
260  s2Char++;
261  }
262  std::swap( currentScores, previousScores );
263  s1Char++;
264  s2Char = s2Start;
265  }
266  delete [] currentScores;
267  delete [] previousScores;
268  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
269 }
270 
271 int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
272 {
273  if ( string1.isEmpty() && string2.isEmpty() )
274  {
275  //empty strings, solution is trivial...
276  return 0;
277  }
278 
279  if ( string1.length() != string2.length() )
280  {
281  //invalid inputs
282  return -1;
283  }
284 
285  //handle case sensitive flag (or not)
286  QString s1( caseSensitive ? string1 : string1.toLower() );
287  QString s2( caseSensitive ? string2 : string2.toLower() );
288 
289  if ( s1 == s2 )
290  {
291  //another trivial case, identical strings
292  return 0;
293  }
294 
295  int distance = 0;
296  const QChar *s1Char = s1.constData();
297  const QChar *s2Char = s2.constData();
298 
299  for ( int i = 0; i < string1.length(); ++i )
300  {
301  if ( *s1Char != *s2Char )
302  distance++;
303  s1Char++;
304  s2Char++;
305  }
306 
307  return distance;
308 }
309 
310 QString QgsStringUtils::soundex( const QString &string )
311 {
312  if ( string.isEmpty() )
313  return QString();
314 
315  QString tmp = string.toUpper();
316 
317  //strip non character codes, and vowel like characters after the first character
318  QChar *char1 = tmp.data();
319  QChar *char2 = tmp.data();
320  int outLen = 0;
321  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
322  {
323  if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
324  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
325  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
326  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
327  {
328  *char1 = *char2;
329  char1++;
330  outLen++;
331  }
332  }
333  tmp.truncate( outLen );
334 
335  QChar *tmpChar = tmp.data();
336  tmpChar++;
337  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
338  {
339  switch ( ( *tmpChar ).unicode() )
340  {
341  case 0x42:
342  case 0x46:
343  case 0x50:
344  case 0x56:
345  tmp.replace( i, 1, QChar( 0x31 ) );
346  break;
347 
348  case 0x43:
349  case 0x47:
350  case 0x4A:
351  case 0x4B:
352  case 0x51:
353  case 0x53:
354  case 0x58:
355  case 0x5A:
356  tmp.replace( i, 1, QChar( 0x32 ) );
357  break;
358 
359  case 0x44:
360  case 0x54:
361  tmp.replace( i, 1, QChar( 0x33 ) );
362  break;
363 
364  case 0x4C:
365  tmp.replace( i, 1, QChar( 0x34 ) );
366  break;
367 
368  case 0x4D:
369  case 0x4E:
370  tmp.replace( i, 1, QChar( 0x35 ) );
371  break;
372 
373  case 0x52:
374  tmp.replace( i, 1, QChar( 0x36 ) );
375  break;
376  }
377  }
378 
379  //remove adjacent duplicates
380  char1 = tmp.data();
381  char2 = tmp.data();
382  char2++;
383  outLen = 1;
384  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
385  {
386  if ( *char2 != *char1 )
387  {
388  char1++;
389  *char1 = *char2;
390  outLen++;
391  if ( outLen == 4 )
392  break;
393  }
394  }
395  tmp.truncate( outLen );
396  if ( tmp.length() < 4 )
397  {
398  tmp.append( "000" );
399  tmp.truncate( 4 );
400  }
401 
402  return tmp;
403 }
404 
405 QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
406 {
407  QString converted = string;
408 
409  // http://alanstorm.com/url_regex_explained
410  // note - there's more robust implementations available, but we need one which works within the limitation of QRegExp
411  static QRegExp urlRegEx( "(\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/))))" );
412  static QRegExp protoRegEx( "^(?:f|ht)tps?://|file://" );
413  static QRegExp emailRegEx( "([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)" );
414 
415  int offset = 0;
416  bool found = false;
417  while ( urlRegEx.indexIn( converted, offset ) != -1 )
418  {
419  found = true;
420  QString url = urlRegEx.cap( 1 );
421  QString protoUrl = url;
422  if ( protoRegEx.indexIn( protoUrl ) == -1 )
423  {
424  protoUrl.prepend( "http://" );
425  }
426  QString anchor = QStringLiteral( "<a href=\"%1\">%2</a>" ).arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
427  converted.replace( urlRegEx.pos( 1 ), url.length(), anchor );
428  offset = urlRegEx.pos( 1 ) + anchor.length();
429  }
430  offset = 0;
431  while ( emailRegEx.indexIn( converted, offset ) != -1 )
432  {
433  found = true;
434  QString email = emailRegEx.cap( 1 );
435  QString anchor = QStringLiteral( "<a href=\"mailto:%1\">%1</a>" ).arg( email.toHtmlEscaped() );
436  converted.replace( emailRegEx.pos( 1 ), email.length(), anchor );
437  offset = emailRegEx.pos( 1 ) + anchor.length();
438  }
439 
440  if ( foundLinks )
441  *foundLinks = found;
442 
443  return converted;
444 }
445 
446 QString QgsStringUtils::wordWrap( const QString &string, const int length, const bool useMaxLineLength, const QString &customDelimiter )
447 {
448  if ( string.isEmpty() || length == 0 )
449  return string;
450 
451  QString newstr;
452  QRegExp rx;
453  int delimiterLength = 0;
454 
455  if ( !customDelimiter.isEmpty() )
456  {
457  rx.setPatternSyntax( QRegExp::FixedString );
458  rx.setPattern( customDelimiter );
459  delimiterLength = customDelimiter.length();
460  }
461  else
462  {
463  // \x200B is a ZERO-WIDTH SPACE, needed for worwrap to support a number of complex scripts (Indic, Arabic, etc.)
464  rx.setPattern( QStringLiteral( "[\\s\\x200B]" ) );
465  delimiterLength = 1;
466  }
467 
468  const QStringList lines = string.split( '\n' );
469  int strLength, strCurrent, strHit, lastHit;
470 
471  for ( int i = 0; i < lines.size(); i++ )
472  {
473  strLength = lines.at( i ).length();
474  strCurrent = 0;
475  strHit = 0;
476  lastHit = 0;
477 
478  while ( strCurrent < strLength )
479  {
480  // positive wrap value = desired maximum line width to wrap
481  // negative wrap value = desired minimum line width before wrap
482  if ( useMaxLineLength )
483  {
484  //first try to locate delimiter backwards
485  strHit = lines.at( i ).lastIndexOf( rx, strCurrent + length );
486  if ( strHit == lastHit || strHit == -1 )
487  {
488  //if no new backward delimiter found, try to locate forward
489  strHit = lines.at( i ).indexOf( rx, strCurrent + std::abs( length ) );
490  }
491  lastHit = strHit;
492  }
493  else
494  {
495  strHit = lines.at( i ).indexOf( rx, strCurrent + std::abs( length ) );
496  }
497  if ( strHit > -1 )
498  {
499  newstr.append( lines.at( i ).midRef( strCurrent, strHit - strCurrent ) );
500  newstr.append( '\n' );
501  strCurrent = strHit + delimiterLength;
502  }
503  else
504  {
505  newstr.append( lines.at( i ).midRef( strCurrent ) );
506  strCurrent = strLength;
507  }
508  }
509  if ( i < lines.size() - 1 )
510  newstr.append( '\n' );
511  }
512 
513  return newstr;
514 }
515 
516 QgsStringReplacement::QgsStringReplacement( const QString &match, const QString &replacement, bool caseSensitive, bool wholeWordOnly )
517  : mMatch( match )
518  , mReplacement( replacement )
519  , mCaseSensitive( caseSensitive )
520  , mWholeWordOnly( wholeWordOnly )
521 {
522  if ( mWholeWordOnly )
523  mRx = QRegExp( QString( "\\b%1\\b" ).arg( mMatch ),
524  mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
525 }
526 
527 QString QgsStringReplacement::process( const QString &input ) const
528 {
529  QString result = input;
530  if ( !mWholeWordOnly )
531  {
532  return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
533  }
534  else
535  {
536  return result.replace( mRx, mReplacement );
537  }
538 }
539 
541 {
542  QgsStringMap map;
543  map.insert( QStringLiteral( "match" ), mMatch );
544  map.insert( QStringLiteral( "replace" ), mReplacement );
545  map.insert( QStringLiteral( "caseSensitive" ), mCaseSensitive ? "1" : "0" );
546  map.insert( QStringLiteral( "wholeWord" ), mWholeWordOnly ? "1" : "0" );
547  return map;
548 }
549 
551 {
552  return QgsStringReplacement( properties.value( QStringLiteral( "match" ) ),
553  properties.value( QStringLiteral( "replace" ) ),
554  properties.value( QStringLiteral( "caseSensitive" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ),
555  properties.value( QStringLiteral( "wholeWord" ), QStringLiteral( "0" ) ) == QLatin1String( "1" ) );
556 }
557 
558 QString QgsStringReplacementCollection::process( const QString &input ) const
559 {
560  QString result = input;
561  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
562  {
563  result = r.process( result );
564  }
565  return result;
566 }
567 
568 void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
569 {
570  Q_FOREACH ( const QgsStringReplacement &r, mReplacements )
571  {
572  QgsStringMap props = r.properties();
573  QDomElement propEl = doc.createElement( QStringLiteral( "replacement" ) );
574  QgsStringMap::const_iterator it = props.constBegin();
575  for ( ; it != props.constEnd(); ++it )
576  {
577  propEl.setAttribute( it.key(), it.value() );
578  }
579  elem.appendChild( propEl );
580  }
581 }
582 
583 void QgsStringReplacementCollection::readXml( const QDomElement &elem )
584 {
585  mReplacements.clear();
586  QDomNodeList nodelist = elem.elementsByTagName( QStringLiteral( "replacement" ) );
587  for ( int i = 0; i < nodelist.count(); i++ )
588  {
589  QDomElement replacementElem = nodelist.at( i ).toElement();
590  QDomNamedNodeMap nodeMap = replacementElem.attributes();
591 
592  QgsStringMap props;
593  for ( int j = 0; j < nodeMap.count(); ++j )
594  {
595  props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
596  }
597  mReplacements << QgsStringReplacement::fromProperties( props );
598  }
599 
600 }
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
A representation of a single string replacement.
static QString wordWrap(const QString &string, int length, bool useMaxLineLength=true, const QString &customDelimiter=QString())
Automatically wraps a string by inserting new line characters at appropriate locations in the string.
Simple title case conversion - does not fully grammatically parse the text and uses simple rules only.
QMap< QString, QString > QgsStringMap
Definition: qgis.h:577
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
static QString capitalize(const QString &string, Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
Convert just the first letter of each word to uppercase, leave the rest untouched.
Convert all characters to uppercase.
Capitalization
Capitalization options.
static QString ampersandEncode(const QString &string)
Makes a raw string safe for inclusion as a HTML/XML string literal.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringReplacement objects contained by this collection.
Mixed case, ie no change.
Convert all characters to lowercase.
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a ...> links.
QgsStringMap properties() const
Returns a map of the replacement properties.
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.