TextSimilarityService
in package
Table of Contents
Constants
- CACHE_SIZE_LIMIT = 1000
- DEFAULT_NGRAM_SIZE = 2
- SIMILARITY_THRESHOLDS = ['EXACT' => 100.0, 'VERY_HIGH' => 95.0, 'HIGH' => 85.0, 'MEDIUM' => 70.0, 'LOW' => 50.0]
Properties
- $ngramCache : array<string|int, mixed>
- $scriptCache : array<string|int, mixed>
- $transliterationCache : array<string|int, mixed>
Methods
- calculateBatchSimilarity() : array<string|int, mixed>
- Batch similarity calculation for multiple comparisons
- calculateInternationalNameSimilarity() : array<string|int, mixed>
- calculateNGramSimilarity() : float
- calculateSimilarity() : float
- Calculate the best similarity score between two texts using multiple algorithms.
- calculateWeightedAverage() : float
- clearCaches() : void
- extractInternationalTokens() : array<string|int, mixed>
- findBestMatches() : array<string|int, mixed>
- Find the best matches above a threshold
- getBestInternationalSimilarity() : float
- getSimilarityLevel() : string
- Get similarity confidence level
- getUnicodeScripts() : array<string|int, mixed>
- hasNonLatinScript() : bool
- normalizeInternationalText() : string
- transliterateToLatin() : string
- cacheResult() : mixed
- cacheScript() : array<string|int, mixed>
- cacheTransliteration() : string
- calculateRomanizationSimilarity() : float
- calculateTokenSimilarity() : float
- calculateTransliterationSimilarity() : float
- calculateUnicodeLevenshtein() : float
- calculateUnicodeScriptSimilarity() : float
- extractNGrams() : array<string|int, mixed>
- fallbackTransliteration() : string
- longestCommonSubsequenceSimilarity() : float
- makeCacheKey() : string
- performTransliteration() : string
- unicodeLevenshteinDistance() : int
- unicodeLongestCommonSubsequence() : int
Constants
CACHE_SIZE_LIMIT
private
mixed
CACHE_SIZE_LIMIT
= 1000
DEFAULT_NGRAM_SIZE
private
mixed
DEFAULT_NGRAM_SIZE
= 2
SIMILARITY_THRESHOLDS
private
mixed
SIMILARITY_THRESHOLDS
= ['EXACT' => 100.0, 'VERY_HIGH' => 95.0, 'HIGH' => 85.0, 'MEDIUM' => 70.0, 'LOW' => 50.0]
Properties
$ngramCache
private
array<string|int, mixed>
$ngramCache
= []
$scriptCache
private
array<string|int, mixed>
$scriptCache
= []
$transliterationCache
private
array<string|int, mixed>
$transliterationCache
= []
Methods
calculateBatchSimilarity()
Batch similarity calculation for multiple comparisons
public
calculateBatchSimilarity(string $target, array<string|int, mixed> $candidates) : array<string|int, mixed>
Parameters
- $target : string
- $candidates : array<string|int, mixed>
Return values
array<string|int, mixed>
calculateInternationalNameSimilarity()
public
calculateInternationalNameSimilarity(string $name1, string $name2) : array<string|int, mixed>
Parameters
- $name1 : string
- $name2 : string
Return values
array<string|int, mixed>
calculateNGramSimilarity()
public
calculateNGramSimilarity(string $str1, string $str2[, int $n = self::DEFAULT_NGRAM_SIZE ]) : float
Parameters
- $str1 : string
- $str2 : string
- $n : int = self::DEFAULT_NGRAM_SIZE
Return values
float
calculateSimilarity()
Calculate the best similarity score between two texts using multiple algorithms.
public
calculateSimilarity(string $text1, string $text2) : float
Parameters
- $text1 : string
- $text2 : string
Return values
float
calculateWeightedAverage()
public
calculateWeightedAverage(array<string|int, mixed> $scores, array<string|int, mixed> $weights) : float
Parameters
- $scores : array<string|int, mixed>
- $weights : array<string|int, mixed>
Return values
float
clearCaches()
public
clearCaches() : void
extractInternationalTokens()
public
extractInternationalTokens(string $text) : array<string|int, mixed>
Parameters
- $text : string
Return values
array<string|int, mixed>
findBestMatches()
Find the best matches above a threshold
public
findBestMatches(string $target, array<string|int, mixed> $candidates[, float $threshold = 70.0 ][, int $limit = 5 ]) : array<string|int, mixed>
Parameters
- $target : string
- $candidates : array<string|int, mixed>
- $threshold : float = 70.0
- $limit : int = 5
Return values
array<string|int, mixed>
getBestInternationalSimilarity()
public
getBestInternationalSimilarity(string $name1, string $name2) : float
Parameters
- $name1 : string
- $name2 : string
Return values
float
getSimilarityLevel()
Get similarity confidence level
public
getSimilarityLevel(float $score) : string
Parameters
- $score : float
Return values
string
getUnicodeScripts()
public
getUnicodeScripts(string $text) : array<string|int, mixed>
Parameters
- $text : string
Return values
array<string|int, mixed>
hasNonLatinScript()
public
hasNonLatinScript(string $text) : bool
Parameters
- $text : string
Return values
bool
normalizeInternationalText()
public
normalizeInternationalText(string $text) : string
Parameters
- $text : string
Return values
string
transliterateToLatin()
public
transliterateToLatin(string $text) : string
Parameters
- $text : string
Return values
string
cacheResult()
private
cacheResult(string $key, mixed $result) : mixed
Parameters
- $key : string
- $result : mixed
cacheScript()
private
cacheScript(string $key, array<string|int, mixed> $result) : array<string|int, mixed>
Parameters
- $key : string
- $result : array<string|int, mixed>
Return values
array<string|int, mixed>
cacheTransliteration()
private
cacheTransliteration(string $key, string $result) : string
Parameters
- $key : string
- $result : string
Return values
string
calculateRomanizationSimilarity()
private
calculateRomanizationSimilarity(string $name1, string $name2) : float
Parameters
- $name1 : string
- $name2 : string
Return values
float
calculateTokenSimilarity()
private
calculateTokenSimilarity(string $name1, string $name2) : float
Parameters
- $name1 : string
- $name2 : string
Return values
float
calculateTransliterationSimilarity()
private
calculateTransliterationSimilarity(string $name1, string $name2) : float
Parameters
- $name1 : string
- $name2 : string
Return values
float
calculateUnicodeLevenshtein()
private
calculateUnicodeLevenshtein(string $str1, string $str2, int $maxLen) : float
Parameters
- $str1 : string
- $str2 : string
- $maxLen : int
Return values
float
calculateUnicodeScriptSimilarity()
private
calculateUnicodeScriptSimilarity(string $name1, string $name2) : float
Parameters
- $name1 : string
- $name2 : string
Return values
float
extractNGrams()
private
extractNGrams(string $text, int $n) : array<string|int, mixed>
Parameters
- $text : string
- $n : int
Return values
array<string|int, mixed>
fallbackTransliteration()
private
fallbackTransliteration(string $text) : string
Parameters
- $text : string
Return values
string
longestCommonSubsequenceSimilarity()
private
longestCommonSubsequenceSimilarity(string $str1, string $str2) : float
Parameters
- $str1 : string
- $str2 : string
Return values
float
makeCacheKey()
private
makeCacheKey(string $str1, string $str2) : string
Parameters
- $str1 : string
- $str2 : string
Return values
string
performTransliteration()
private
performTransliteration(string $text) : string
Parameters
- $text : string
Return values
string
unicodeLevenshteinDistance()
private
unicodeLevenshteinDistance(string $str1, string $str2) : int
Parameters
- $str1 : string
- $str2 : string
Return values
int
unicodeLongestCommonSubsequence()
private
unicodeLongestCommonSubsequence(string $str1, string $str2) : int
Parameters
- $str1 : string
- $str2 : string