Skip to content

Instantly share code, notes, and snippets.

@Tkachenko-Ivan
Last active March 10, 2024 19:10
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save Tkachenko-Ivan/354db31938f7ed4218ac5d20c7f9502b to your computer and use it in GitHub Desktop.
Save Tkachenko-Ivan/354db31938f7ed4218ac5d20c7f9502b to your computer and use it in GitHub Desktop.
Пример конфигурационного файла Sphinx Search или Manticore Search (совместимы), для поддержки фонетических алгоритмов: Soundex, улучшенный Soundex, NYSIIS, Daitch-Mokotoff Soundex, Caverphone, Metaphone, русский Metaphone
searchd
{
listen = 9306:mysql41
listen = /var/run/mysqld/mysqld.sock:mysql41
listen = 9308:http
log = /var/log/manticore/searchd.log
query_log = /var/log/manticore/query.log
binlog_path = /var/log/manticore
pid_file = /var/run/manticore/searchd.pid
mysql_version_string = 5.0.0
}
source base
{
type = pgsql
sql_host = db
sql_db = postgres
sql_user = postgres
sql_pass = StreetPass
sql_port = 5432
}
source STREETS: base
{
sql_field_string = offname
sql_attr_string = aoguid
sql_attr_string = shortname
sql_query = select id, aoguid, shortname, offname from fias_addrobj
}
index simple_soundex
{
# Транслитерация
regexp_filter = (А|а) => a
regexp_filter = (Б|б) => b
regexp_filter = (В|в) => v
regexp_filter = (Г|г) => g
regexp_filter = (Д|д) => d
regexp_filter = (Е|е) => e
regexp_filter = (Ё|ё) => jo
regexp_filter = (Ж|ж) => zh
regexp_filter = (З|з) => z
regexp_filter = (И|и) => i
regexp_filter = (Й|й) => jj
regexp_filter = (К|к) => k
regexp_filter = (Л|л) => l
regexp_filter = (М|м) => m
regexp_filter = (Н|н) => n
regexp_filter = (О|о) => o
regexp_filter = (П|п) => p
regexp_filter = (Р|р) => r
regexp_filter = (С|с) => s
regexp_filter = (Т|т) => t
regexp_filter = (У|у) => u
regexp_filter = (Ф|ф) => f
regexp_filter = (Х|х) => kh
regexp_filter = (Ц|ц) => c
regexp_filter = (Ч|ч) => ch
regexp_filter = (Ш|ш) => sh
regexp_filter = (Щ|щ) => shh
regexp_filter = (Ъ|ъ) =>
regexp_filter = (Ы|ы) => y
regexp_filter = (Ь|ь) =>
regexp_filter = (Э|э) => eh
regexp_filter = (Ю|ю) => ju
regexp_filter = (Я|я) => ja
morphology = soundex
# Настройки индекса
min_infix_len = 2
source = STREETS
path = /var/lib/manticore/SimpleSoundex
}
index refined_soundex
{
# Транслитерация
regexp_filter = (А|а) => a
regexp_filter = (Б|б) => b
regexp_filter = (В|в) => v
regexp_filter = (Г|г) => g
regexp_filter = (Д|д) => d
regexp_filter = (Е|е) => e
regexp_filter = (Ё|ё) => jo
regexp_filter = (Ж|ж) => zh
regexp_filter = (З|з) => z
regexp_filter = (И|и) => i
regexp_filter = (Й|й) => jj
regexp_filter = (К|к) => k
regexp_filter = (Л|л) => l
regexp_filter = (М|м) => m
regexp_filter = (Н|н) => n
regexp_filter = (О|о) => o
regexp_filter = (П|п) => p
regexp_filter = (Р|р) => r
regexp_filter = (С|с) => s
regexp_filter = (Т|т) => t
regexp_filter = (У|у) => u
regexp_filter = (Ф|ф) => f
regexp_filter = (Х|х) => kh
regexp_filter = (Ц|ц) => c
regexp_filter = (Ч|ч) => ch
regexp_filter = (Ш|ш) => sh
regexp_filter = (Щ|щ) => shh
regexp_filter = (Ъ|ъ) =>
regexp_filter = (Ы|ы) => y
regexp_filter = (Ь|ь) =>
regexp_filter = (Э|э) => eh
regexp_filter = (Ю|ю) => ju
regexp_filter = (Я|я) => ja
# Регулярные выражения для обработки первого символа
regexp_filter = \b(A|a) => a
regexp_filter = \b(B|b) => b
regexp_filter = \b(C|c) => c
regexp_filter = \b(D|d) => d
regexp_filter = \b(E|e) => e
regexp_filter = \b(F|f) => f
regexp_filter = \b(G|g) => g
regexp_filter = \b(H|h) => h
regexp_filter = \b(I|i) => i
regexp_filter = \b(J|j) => j
regexp_filter = \b(K|k) => k
regexp_filter = \b(L|l) => l
regexp_filter = \b(M|m) => m
regexp_filter = \b(N|n) => n
regexp_filter = \b(O|o) => o
regexp_filter = \b(P|p) => p
regexp_filter = \b(Q|q) => q
regexp_filter = \b(R|r) => r
regexp_filter = \b(S|s) => s
regexp_filter = \b(T|t) => t
regexp_filter = \b(U|u) => u
regexp_filter = \b(V|v) => v
regexp_filter = \b(W|w) => w
regexp_filter = \b(X|x) => x
regexp_filter = \b(Y|y) => y
regexp_filter = \b(Z|z) => z
# Регулярные выражения для обработки НЕ первого символа
regexp_filter = \B(A|a) =>
regexp_filter = \B(B|b) => 1
regexp_filter = \B(C|c) => 3
regexp_filter = \B(D|d) => 6
regexp_filter = \B(E|e) =>
regexp_filter = \B(F|f) => 2
regexp_filter = \B(G|g) => 4
regexp_filter = \B(H|h) =>
regexp_filter = \B(I|i) =>
regexp_filter = \B(J|j) => 4
regexp_filter = \B(K|k) => 3
regexp_filter = \B(L|l) => 7
regexp_filter = \B(M|m) => 8
regexp_filter = \B(N|n) => 8
regexp_filter = \B(O|o) =>
regexp_filter = \B(P|p) => 1
regexp_filter = \B(Q|q) => 5
regexp_filter = \B(R|r) => 9
regexp_filter = \B(S|s) => 3
regexp_filter = \B(T|t) => 6
regexp_filter = \B(U|u) =>
regexp_filter = \B(V|v) => 2
regexp_filter = \B(W|w) =>
regexp_filter = \B(X|x) => 5
regexp_filter = \B(Y|y) =>
regexp_filter = \B(Z|z) => 5
# Идущие подряд сливаем
regexp_filter = 0+ => 0
regexp_filter = 1+ => 1
regexp_filter = 2+ => 2
regexp_filter = 3+ => 3
regexp_filter = 4+ => 4
regexp_filter = 5+ => 5
regexp_filter = 6+ => 6
regexp_filter = 7+ => 7
regexp_filter = 8+ => 8
regexp_filter = 9+ => 9
# Настройки индекса
min_infix_len = 2
source = STREETS
path = /var/lib/manticore/RefinedSoundex
}
index nysiis
{
# Транслитерация
regexp_filter = (А|а) => a
regexp_filter = (Б|б) => b
regexp_filter = (В|в) => v
regexp_filter = (Г|г) => g
regexp_filter = (Д|д) => d
regexp_filter = (Е|е) => e
regexp_filter = (Ё|ё) => jo
regexp_filter = (Ж|ж) => zh
regexp_filter = (З|з) => z
regexp_filter = (И|и) => i
regexp_filter = (Й|й) => jj
regexp_filter = (К|к) => k
regexp_filter = (Л|л) => l
regexp_filter = (М|м) => m
regexp_filter = (Н|н) => n
regexp_filter = (О|о) => o
regexp_filter = (П|п) => p
regexp_filter = (Р|р) => r
regexp_filter = (С|с) => s
regexp_filter = (Т|т) => t
regexp_filter = (У|у) => u
regexp_filter = (Ф|ф) => f
regexp_filter = (Х|х) => kh
regexp_filter = (Ц|ц) => c
regexp_filter = (Ч|ч) => ch
regexp_filter = (Ш|ш) => sh
regexp_filter = (Щ|щ) => shh
regexp_filter = (Ъ|ъ) =>
regexp_filter = (Ы|ы) => y
regexp_filter = (Ь|ь) =>
regexp_filter = (Э|э) => eh
regexp_filter = (Ю|ю) => ju
regexp_filter = (Я|я) => ja
# Преобразовать начало слова
regexp_filter = (?i)\b(mac) => mcc
regexp_filter = (?i)\b(kn) => n
regexp_filter = (?i)\b(k) => c
regexp_filter = (?i)\b(ph|pf) => ff
regexp_filter = (?i)\b(sch) => sss
# Преобразовать конец слова
regexp_filter = (?i)(ee)\b => y
regexp_filter = (?i)(ie)\b => y
regexp_filter = (?i)(dt|rt|rd|nt|nd)\b => d
# После гласных: удалить H, преобразовать W → A
regexp_filter = (?i)(a|e|i|o|u|y)h => \1
regexp_filter = (?i)(a|e|i|o|u|y)w => \1a
# Преобразуем все буквы кроме первой
regexp_filter = (?i)\B(ev) => af
regexp_filter = (?i)\B(e|i|o|u) => a
regexp_filter = (?i)\B(q) => g
regexp_filter = (?i)\B(z) => s
regexp_filter = (?i)\B(m) => n
regexp_filter = (?i)\B(kn) => n
regexp_filter = (?i)\B(k) => c
regexp_filter = (?i)\B(sch) => sss
regexp_filter = (?i)\B(ph) => ff
# Удалить S на конце
regexp_filter = (?i)s\b =>
# Преобразовать AY на конце → Y
regexp_filter = (?i)(ay)\b => y
# Удалить A на конце
regexp_filter = (?i)a\b =>
# Настройки индекса
min_infix_len = 2
source = STREETS
path = /var/lib/manticore/NYSIIS
}
index daitch_mokotoff_soundex
{
# Транслитерация
regexp_filter = (А|а) => a
regexp_filter = (Б|б) => b
regexp_filter = (В|в) => v
regexp_filter = (Г|г) => g
regexp_filter = (Д|д) => d
regexp_filter = (Е|е) => e
regexp_filter = (Ё|ё) => jo
regexp_filter = (Ж|ж) => zh
regexp_filter = (З|з) => z
regexp_filter = (И|и) => i
regexp_filter = (Й|й) => jj
regexp_filter = (К|к) => k
regexp_filter = (Л|л) => l
regexp_filter = (М|м) => m
regexp_filter = (Н|н) => n
regexp_filter = (О|о) => o
regexp_filter = (П|п) => p
regexp_filter = (Р|р) => r
regexp_filter = (С|с) => s
regexp_filter = (Т|т) => t
regexp_filter = (У|у) => u
regexp_filter = (Ф|ф) => f
regexp_filter = (Х|х) => kh
regexp_filter = (Ц|ц) => c
regexp_filter = (Ч|ч) => ch
regexp_filter = (Ш|ш) => sh
regexp_filter = (Щ|щ) => shh
regexp_filter = (Ъ|ъ) =>
regexp_filter = (Ы|ы) => y
regexp_filter = (Ь|ь) =>
regexp_filter = (Э|э) => eh
regexp_filter = (Ю|ю) => ju
regexp_filter = (Я|я) => ja
# Чувстветелен к порядку преобразований, не следует объединять разные шаги, даже если очень похожи
regexp_filter = (?i)\b(ai|aj|ay|ei|ey|ej|oi|oj|oy|ui|uj|uy) => 0
regexp_filter = (?i)(a|e|i|o|u|y)(ai|aj|ay|ei|ey|ej|oi|oj|oy|ui|uj|uy) => \11
regexp_filter = (?i)(ai|aj|ay|ei|ey|ej|oi|oj|oy|ui|uj|uy) =>
regexp_filter = (?i)\b(au) => 0
regexp_filter = (?i)(a|e|i|o|u|y)(au) => \17
regexp_filter = (?i)au =>
regexp_filter = (?i)\b(ia|ie|io|iu) => 0
regexp_filter = (?i)(ia|ie|io|iu) =>
regexp_filter = (?i)\b(eu) => 1
regexp_filter = (?i)(a|e|i|o|u|y)(eu) => \11
regexp_filter = (?i)eu =>
regexp_filter = (?i)\b(a|ue|e|i|o|u|y) => 0
regexp_filter = (?i)(a|ue|e|i|o|u|y) =>
regexp_filter = (?i)j => 1
regexp_filter = (?i)\b(schtsch|schtsh|schtch|shtch|shch|shtsh|stch|stsch|strz|strs|stsh|szcz|szcs) => 2
regexp_filter = (?i)(schtsch|schtsh|schtch|shtch|shch|shtsh|stch|stsch|strz|strs|stsh|szcz|szcs) => 4
regexp_filter = (?i)\b(sht|scht|schd|st|szt|shd|szd|sd) => 2
regexp_filter = (?i)(sht|scht|schd|st|szt|shd|szd|sd) => 43
regexp_filter = (?i)(csz|czs|cs|cz|drz|drs|dsh|ds|dzh|dzs|dz|trz|trs|trch|tsh|ttsz|ttz|tzs|tsz|sz|ttch|tch|ttsch|zsch|zhsh|sch|sh|tts|tc|ts|tz|zh|zs) => 4
regexp_filter = (?i)\b(sc) => 2
regexp_filter = (?i)sc => 4
regexp_filter = (?i)(dt|d|th|t) => 3
regexp_filter = (?i)\b(chs|ks|x) => 5
regexp_filter = (?i)(chs|ks|x) => 54
regexp_filter = (?i)(s|z) => 4
regexp_filter = (?i)(ch|ck|c|g|kh|k|q) => 5
regexp_filter = (?i)\b(mn|nm) =>
regexp_filter = (?i)(mn|nm) => 66
regexp_filter = (?i)(m|n) => 6
regexp_filter = (?i)(fb|b|ph|pf|f|p|v|w) => 7
regexp_filter = (?i)\b(h) => 5
regexp_filter = (?i)(a|e|i|o|u|y)h => \15
regexp_filter = (?i)h =>
regexp_filter = (?i)l => 8
regexp_filter = (?i)r => 9
# Настройки индекса
min_infix_len = 2
source = STREETS
path = /var/lib/manticore/DaitchMokotoffSoundex
}
index metaphone
{
# Транслитерация
regexp_filter = (А|а) => a
regexp_filter = (Б|б) => b
regexp_filter = (В|в) => v
regexp_filter = (Г|г) => g
regexp_filter = (Д|д) => d
regexp_filter = (Е|е) => e
regexp_filter = (Ё|ё) => jo
regexp_filter = (Ж|ж) => zh
regexp_filter = (З|з) => z
regexp_filter = (И|и) => i
regexp_filter = (Й|й) => jj
regexp_filter = (К|к) => k
regexp_filter = (Л|л) => l
regexp_filter = (М|м) => m
regexp_filter = (Н|н) => n
regexp_filter = (О|о) => o
regexp_filter = (П|п) => p
regexp_filter = (Р|р) => r
regexp_filter = (С|с) => s
regexp_filter = (Т|т) => t
regexp_filter = (У|у) => u
regexp_filter = (Ф|ф) => f
regexp_filter = (Х|х) => kh
regexp_filter = (Ц|ц) => c
regexp_filter = (Ч|ч) => ch
regexp_filter = (Ш|ш) => sh
regexp_filter = (Щ|щ) => shh
regexp_filter = (Ъ|ъ) =>
regexp_filter = (Ы|ы) => y
regexp_filter = (Ь|ь) =>
regexp_filter = (Э|э) => eh
regexp_filter = (Ю|ю) => ju
regexp_filter = (Я|я) => ja
morphology = metaphone
blend_chars = U+0020
# Настройки индекса
min_infix_len = 2
source = STREETS
path = /var/lib/manticore/Metaphone
}
index rus_metaphone
{
# Переделываем гласные
regexp_filter = (?i)(йо|ио|йе|ие) => и
regexp_filter = (?i)(о|ы|я) => а
regexp_filter = (?i)(е|ё|э) => и
regexp_filter = (?i)(ю) => у
# Для всех согласных букв, за которыми следует любая согласная, кроме Л, М, Н или Р, провести оглушение
regexp_filter = (?i)(б)(б|в|г|д|ж|з|й|к|п|с|т|ф|х|ц|ч|ш|щ) => п\2
regexp_filter = (?i)(г)(б|в|г|д|ж|з|й|к|п|с|т|ф|х|ц|ч|ш|щ) => к\2
regexp_filter = (?i)(в)(б|в|г|д|ж|з|й|к|п|с|т|ф|х|ц|ч|ш|щ) => ф\2
regexp_filter = (?i)(д)(б|в|г|д|ж|з|й|к|п|с|т|ф|х|ц|ч|ш|щ) => т\2
regexp_filter = (?i)(ж)(б|в|г|д|ж|з|й|к|п|с|т|ф|х|ц|ч|ш|щ) => ш\2
regexp_filter = (?i)(з)(б|в|г|д|ж|з|й|к|п|с|т|ф|х|ц|ч|ш|щ) => с\2
# Для согласных на конце слова, провести оглушение
regexp_filter = (?i)б\b => п
regexp_filter = (?i)г\b => к
regexp_filter = (?i)в\b => ф
regexp_filter = (?i)д\b => т
regexp_filter = (?i)ж\b => ш
regexp_filter = (?i)з\b => з
regexp_filter = (?i)(тс|дс|ц) => ц
# Настройки индекса
min_infix_len = 2
source = STREETS
path = /var/lib/manticore/RusMetaphone
}
index caverphone
{
# Транслитерация
regexp_filter = (А|а) => a
regexp_filter = (Б|б) => b
regexp_filter = (В|в) => v
regexp_filter = (Г|г) => g
regexp_filter = (Д|д) => d
regexp_filter = (Е|е) => e
regexp_filter = (Ё|ё) => jo
regexp_filter = (Ж|ж) => zh
regexp_filter = (З|з) => z
regexp_filter = (И|и) => i
regexp_filter = (Й|й) => jj
regexp_filter = (К|к) => k
regexp_filter = (Л|л) => l
regexp_filter = (М|м) => m
regexp_filter = (Н|н) => n
regexp_filter = (О|о) => o
regexp_filter = (П|п) => p
regexp_filter = (Р|р) => r
regexp_filter = (С|с) => s
regexp_filter = (Т|т) => t
regexp_filter = (У|у) => u
regexp_filter = (Ф|ф) => f
regexp_filter = (Х|х) => kh
regexp_filter = (Ц|ц) => c
regexp_filter = (Ч|ч) => ch
regexp_filter = (Ш|ш) => sh
regexp_filter = (Щ|щ) => shh
regexp_filter = (Ъ|ъ) =>
regexp_filter = (Ы|ы) => y
regexp_filter = (Ь|ь) =>
regexp_filter = (Э|э) => eh
regexp_filter = (Ю|ю) => ju
regexp_filter = (Я|я) => ja
regexp_filter = (A|a) => a
regexp_filter = (B|b) => b
regexp_filter = (C|c) => c
regexp_filter = (D|d) => d
regexp_filter = (E|e) => e
regexp_filter = (F|f) => f
regexp_filter = (G|g) => g
regexp_filter = (H|h) => h
regexp_filter = (I|i) => i
regexp_filter = (J|j) => j
regexp_filter = (K|k) => k
regexp_filter = (L|L) => l
regexp_filter = (M|m) => m
regexp_filter = (N|n) => n
regexp_filter = (O|o) => o
regexp_filter = (P|p) => p
regexp_filter = (Q|q) => q
regexp_filter = (R|r) => r
regexp_filter = (S|s) => s
regexp_filter = (T|t) => t
regexp_filter = (U|u) => u
regexp_filter = (V|v) => v
regexp_filter = (W|w) => w
regexp_filter = (X|x) => x
regexp_filter = (Y|y) => y
regexp_filter = (Z|z) => z
# Удалить e на конце
regexp_filter = e\b =>
# Преобразовать начало слова
regexp_filter = \b(cough) => cou2f
regexp_filter = \b(rough) => rou2f
regexp_filter = \b(tough) => tou2f
regexp_filter = \b(enough) => enou2f
regexp_filter = \b(gn) => 2n
regexp_filter = \b(mb) => m2
# Провести замены символов
regexp_filter = (cq) => 2q
regexp_filter = (ci) => si
regexp_filter = (ce) => se
regexp_filter = (cy) => sy
regexp_filter = (tch) => 2ch
regexp_filter = (c|q|x) => k
regexp_filter = (v) => f
regexp_filter = (dg) => 2g
regexp_filter = (tio) => sio
regexp_filter = (tia) => sia
regexp_filter = (d) => t
regexp_filter = (ph) => fh
regexp_filter = (b) => p
regexp_filter = (sh) => s2
# Заменить все гласные в начале слова на A
regexp_filter = (?i)\b(a|e|i|o|u|y) => A
# в остальных случаях — на 3
regexp_filter = (?i)(a|e|i|o|u|y) => 3
# Замены
regexp_filter = (j) => y
regexp_filter = \b(y3) => Y3
regexp_filter = \b(y) => A
regexp_filter = y => 3
regexp_filter = 3gh3 => 3kh3
regexp_filter = gh => 22
regexp_filter = g => k
regexp_filter = s+ => S
regexp_filter = t+ => T
regexp_filter = p+ => P
regexp_filter = k+ => K
regexp_filter = f+ => F
regexp_filter = m+ => M
regexp_filter = n+ => N
regexp_filter = w3 => W3
regexp_filter = wh3 => Wh3
regexp_filter = w\b => 3
regexp_filter = w => 2
regexp_filter = \b(h) => A
regexp_filter = h => 2
regexp_filter = r3 => R3
regexp_filter = r\b => 3
regexp_filter = r => 2
regexp_filter = l3 => L3
regexp_filter = l\b => 3
regexp_filter = l => 2
# Удалить все цифры 2
regexp_filter = 2 =>
# Если на конце слова осталась цифра 3, то заменить её на A
regexp_filter = 3\b => A
# Удалить все цифры 3
regexp_filter = 3 =>
# Настройки индекса
min_infix_len = 2
source = STREETS
path = /var/lib/manticore/Caverphone
}
@excat
Copy link

excat commented Feb 14, 2023

В сфинксе 2.2.11 транслит через regexp_filter не работает что-ли?

@Tkachenko-Ivan
Copy link
Author

Tkachenko-Ivan commented Feb 15, 2023

@excat должно работать, в доках пишут что он появился в версии 2.1.1-beta.

Я на каждой версии не проверял, у меня на тот момент 2.3.2-beta была, если не ошибаюсь.

Как вариант: не все фичи мигрируют из беты в следующий стабильный релиз. Но тут уже наши полномочия всё.

@excat
Copy link

excat commented Feb 15, 2023

@Tkachenko-Ivan Останавливал процесс, удалял индекс (у меня rt кстати). Перестраивал. Не срабатывает. Брал кусок metaphone. При call keywords не работает и на сайте тоже видно по выдаче товара. Жаль что обновить версию не могу. Хотел решить проблему написания иностранных брендов таким способом.

@Tkachenko-Ivan
Copy link
Author

Tkachenko-Ivan commented Feb 15, 2023

@excat с RT индексами такая штука не прокатит к сожалению, по крайней мере мне так кажется, и раз у Вас не сработало, то наверно дело в этом.

Есть вероятность что в manticoresearch это исправили, - там в RT заработали многие вещи которые не работали в Sphinx, но если нет возможности обновиться, - то это совет в никуда.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment