Your IP : 216.73.217.49


Current Path : /var/www/amortar/data/www/a-mortar/bitrix/modules/search/tools/ru/
Upload File :
Current File : /var/www/amortar/data/www/a-mortar/bitrix/modules/search/tools/ru/stemming.php

<?php
// Module-level setup: builds the character set and the PCRE patterns that stemming_ru() reads
// through its own "global" declarations. Each variable is declared global before assignment so
// that it lands in the global scope even when this file is included from inside a function.
// Every pattern gets the BX_UTF_PCRE_MODIFIER constant appended (defined outside this file).
// NOTE(review): the Cyrillic literals look mis-encoded in this copy of the file - verify the
// file encoding against the upstream source before editing any literal below.

// Characters treated as vowels; used to find the RV region and the R1/R2 boundaries.
global $STEMMING_RU_VOWELS;
$STEMMING_RU_VOWELS = "���������";
// PERFECTIVE GERUND endings, anchored at the end of the string.
global $STEMMING_RU_PERFECTIVE_GERUND;
$STEMMING_RU_PERFECTIVE_GERUND = "/(������|������|������|������|����|����|����|����|��|��|��|��)$/".BX_UTF_PCRE_MODIFIER;

// ADJECTIVE endings mapped to a weight (apparently the ending length). The weights are only
// used below to order the alternatives via arsort().
// NOTE(review): one three-character key carries weight 2 while the others carry 3 - looks
// like a typo; it only affects the ordering of alternatives, confirm before changing.
$STEMMING_RU_ADJECTIVE=array("��"=>2, "��"=>2, "��"=>2, "��"=>2, "���"=>3, "���"=>3, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "���"=>2, "���"=>3, "���"=>3, "���"=>3, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2, "��"=>2);
// PARTICIPLE endings, group 1 (combined below with a leading character class) and group 2.
$STEMMING_RU_PARTICIPLE_GR1=array("��"=>2, "��"=>2, "��"=>2, "��"=>2, "�"=>1);
$STEMMING_RU_PARTICIPLE_GR2=array("���"=>3, "���"=>3, "���"=>3);
// ADJECTIVAL endings = every participle ending concatenated with every adjective ending;
// the stored weight is the sum of both weights.
$STEMMING_RU_ADJECTIVAL_GR1=array();
$STEMMING_RU_ADJECTIVAL_GR2=array();
foreach($STEMMING_RU_ADJECTIVE as $i => $il)
{
	foreach($STEMMING_RU_PARTICIPLE_GR1 as $j => $jl) $STEMMING_RU_ADJECTIVAL_GR1[$j.$i]=$jl+$il;
	foreach($STEMMING_RU_PARTICIPLE_GR2 as $j => $jl) $STEMMING_RU_ADJECTIVAL_GR2[$j.$i]=$jl+$il;
}
// Group 1 pattern: capture group 1 is the required preceding character, capture group 2 is
// the ending itself. Alternatives are ordered by descending weight.
global $STEMMING_RU_ADJECTIVAL1;
arsort($STEMMING_RU_ADJECTIVAL_GR1);
$STEMMING_RU_ADJECTIVAL1="/([��])(".implode("|", array_keys($STEMMING_RU_ADJECTIVAL_GR1)).")$/".BX_UTF_PCRE_MODIFIER;

// Group 2 pattern: the plain adjective endings are merged in as well, so this single pattern
// matches both "participle + adjective" and bare adjective endings.
global $STEMMING_RU_ADJECTIVAL2;
foreach($STEMMING_RU_ADJECTIVE as $i => $il)
	$STEMMING_RU_ADJECTIVAL_GR2[$i]=$il;
arsort($STEMMING_RU_ADJECTIVAL_GR2);
$STEMMING_RU_ADJECTIVAL2="/(".implode("|", array_keys($STEMMING_RU_ADJECTIVAL_GR2)).")$/".BX_UTF_PCRE_MODIFIER;

// VERB endings, group 1: capture group 1 is the required preceding character, group 2 the ending.
global $STEMMING_RU_VERB1;
$STEMMING_RU_VERB1="/([��])(���|���|���|���|�|�|��|��|��|��|��|��|��|��|�|�|�)$/".BX_UTF_PCRE_MODIFIER;

// VERB endings, group 2: matched without any constraint on the preceding character.
global $STEMMING_RU_VERB2;
$STEMMING_RU_VERB2="/(����|����|��|��|��|���|���|���|���|���|���|���|���|���|���|���|���|��|��|��|��|��|��|��|��|��|��|��|�)$/".BX_UTF_PCRE_MODIFIER;
// NOUN endings; tried last in step 1 of stemming_ru().
global $STEMMING_RU_NOUN;
$STEMMING_RU_NOUN="/(����|���|���|���|���|���|��|��|��|��|��|��|��|��|��|��|��|��|��|���|��|��|��|��|��|��|�|�|�|�|�|�|�|�|�|�)$/".BX_UTF_PCRE_MODIFIER;
/**
 * Returns a fixed string of characters for the Russian language.
 *
 * NOTE(review): presumably the set of characters the search tokenizer should treat as
 * letters for "ru" - the caller is not visible in this file, confirm there. The literal
 * looks mis-encoded in this copy of the file.
 *
 * @return string
 */
function stemming_letter_ru()
{
	return "�����������������������������������������������������";
}
/**
 * Comparison callback that orders strings by length, longest first.
 *
 * Lengths are measured with strlen(); strings of equal length compare as equal.
 *
 * @param string $a
 * @param string $b
 * @return int -1 when $a is longer than $b, 1 when it is shorter, 0 when the lengths match.
 */
function stemming_ru_sort($a, $b)
{
	// Positive when $b is the longer string, negative when $a is.
	$lengthGap = strlen($b) - strlen($a);

	if($lengthGap > 0)
		return 1;

	if($lengthGap < 0)
		return -1;

	return 0;
}
/**
 * Tells whether a word should be kept, i.e. it is NOT a stop word.
 *
 * Despite the name, the function returns true for words that pass the filter:
 * false is returned for words shorter than 2 (as measured by strlen) and for words
 * present in the stop list; true otherwise.
 *
 * The stop list is built once per request (static variable) from the built-in entries
 * plus, when the STEMMING_STOP_RU constant is defined, its comma-separated values.
 * Lookup is an exact key match, so $sWord must use the same case as the list entries.
 *
 * @param string $sWord Word to check.
 * @return bool True when the word is not a stop word, false when it must be skipped.
 */
function stemming_stop_ru($sWord)
{
	if(strlen($sWord) < 2)
		return false;
	// Lazily initialised; "false" marks "not built yet".
	static $stop_list = false;
	if(!$stop_list)
	{
		// Words are stored as keys so that membership is a hash lookup; the values are unused.
		$stop_list = array (
			"QUOTE"=>0,"HTTP"=>0,"WWW"=>0,"RU"=>0,"IMG"=>0,"GIF"=>0,"���"=>0,"��"=>0,"���"=>0,
			"���"=>0,"��"=>0,"��"=>0,"��"=>0,"���"=>0,"���"=>0,"��"=>0,"���"=>0,"�"=>0,
			"��"=>0,"���"=>0,"��"=>0,"��"=>0,"���"=>0,"���"=>0,"��"=>0,"��"=>0,"�"=>0,
			"��"=>0,"���"=>0,"��"=>0,"��"=>0,"��"=>0,"����"=>0,"���"=>0,"��"=>0,"��"=>0,
			"���"=>0,"���"=>0,"��"=>0,"��"=>0,"�"=>0,"��"=>0,"��"=>0,"���"=>0,"��"=>0,
			"��"=>0,"��"=>0,"�"=>0,"��"=>0,"��"=>0,"����"=>0,"��"=>0,"���"=>0,"���"=>0,
			"���"=>0,"��"=>0,"��"=>0,"���"=>0,"��"=>0,"��"=>0,"��"=>0,"��"=>0,"���"=>0,
			"�����"=>0,"���"=>0,"��"=>0,"��"=>0,"���"=>0,"���"=>0,"���"=>0,"���"=>0,"���"=>0,
			"��"=>0,"����"=>0,
		);
		// Optional project-specific additions: comma-separated list, blanks are ignored.
		if(defined("STEMMING_STOP_RU"))
		{
			foreach(explode(",", STEMMING_STOP_RU) as $word)
			{
				$word = trim($word);
				if(strlen($word)>0)
					$stop_list[$word]=0;
			}
		}
	}
	return !array_key_exists($sWord, $stop_list);
}

/**
 * Upper-cases text for stemming and then replaces one specific character with another.
 *
 * Upper-casing is delegated to ToUpper($sText, "ru"), which is defined outside this file.
 * NOTE(review): the replaced pair is presumably the letter "yo" mapped to "e", mirroring
 * the same substitution at the top of stemming_ru() - the literals are mis-encoded in this
 * copy, confirm against the upstream source.
 *
 * @param string $sText Text to normalise.
 * @return string Normalised text.
 */
function stemming_upper_ru($sText)
{
	return str_replace(array("�"), array("�"), ToUpper($sText, "ru"));
}

/**
 * Stems a single Russian word following the four-step structure described in the
 * inline comments (perfective gerund / reflexive + adjectival / verb / noun endings,
 * trailing "i", derivational ending in R2, superlative / double "n" / soft sign).
 *
 * The patterns and the vowel set are read from the $STEMMING_RU_* globals prepared at
 * file load time. Lengths and slices use strlen()/substr() next to patterns that carry
 * BX_UTF_PCRE_MODIFIER.
 * NOTE(review): this presumably relies on a single-byte encoding or on mbstring function
 * overloading so that both agree on what a "character" is - confirm for the deployment.
 *
 * @param string $word  Word to stem. NOTE(review): presumably already upper-cased
 *                      (see stemming_upper_ru) - confirm against callers.
 * @param int    $flags Bit mask; when bit 1 is set, words ending like a surname are
 *                      expanded into several candidate stems.
 * @return string|array The stem, or (only when $flags & 1 and a surname ending matched)
 *                      a list of stems produced by recursive calls without flags.
 */
function stemming_ru($word, $flags = 0)
{
	global $STEMMING_RU_VOWELS;
	global $STEMMING_RU_PERFECTIVE_GERUND;
	global $STEMMING_RU_ADJECTIVAL1;
	global $STEMMING_RU_ADJECTIVAL2;
	global $STEMMING_RU_VERB1;
	global $STEMMING_RU_VERB2;
	global $STEMMING_RU_NOUN;
	//There is a 33rd letter, yo, but it is rarely used, and we assume it is mapped into e.
	$word=str_replace("�", "�", $word);
	//Exceptions: words listed here are returned as-is. Only the keys are consulted; the values are unused.
	static $STEMMING_RU_EX = array(
		"����"=>"����",
		"����"=>"����",
		"����"=>"����",
		"����"=>"����",
		"�����"=>"�����",
		"�������"=>"�������",
		"���"=>"���",
		"����"=>"����",
		"�����"=>"�����",
		"����"=>"����",
		"����"=>"����",
	);
	if(isset($STEMMING_RU_EX[$word]))
		return $word;

	//Surname (family name) stemming attempt: begin
	//http://www.gramma.ru/SPR/?id=2.8
	if($flags & 1)
	{
		//Word ends with one of two two-character suffixes: return the stems of
		//the word plus one extra character, the word itself, and the word without the suffix.
		if(preg_match("/(��|��)$/", $word))
		{
			return array(
				stemming_ru($word."�"),
				stemming_ru($word),
				stemming_ru(substr($word, 0, -2)),
			);
		}
		$found = array();
		//Same suffixes followed by a further ending (capture group 2): return the stems of
		//the word itself and of the word with that ending cut off.
		if(preg_match("/(��|��)(�|�|��|�)$/", $word, $found))
		{
			return array(
				stemming_ru($word),
				stemming_ru(substr($word, 0, -strlen($found[2]))),
			);
		}
	}
	//Surname (family name) stemming attempt: end

	//In any word, RV is the region after the first vowel, or the end of the word if it contains no vowel.
	//All tests take place in the RV part of the word.
	$found=array();
	//Group 1 = prefix up to and including the first vowel (kept in $word, never modified below),
	//group 2 = RV (at least one character). If the pattern fails, the word is returned unchanged.
	if(preg_match("/^(.*?[$STEMMING_RU_VOWELS])(.+)$/".BX_UTF_PCRE_MODIFIER, $word, $found))
	{
		$rv = $found[2];
		$word = $found[1];
	}
	else
	{
		return $word;
	}

	//Do each of steps 1, 2, 3 and 4.
	//Step 1: Search for a PERFECTIVE GERUND ending. If one is found remove it, and that is then the end of step 1.


	if(preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found))
	{
		switch($found[0])
		{
			//For these matches the first matched character is kept and only the rest is cut
			//(presumably the vowel that must precede a group 1 gerund ending - literals are mis-encoded here).
			case "��":
			case "����":
			case "������":
			case "��":
			case "����":
			case "������":
				$rv = substr($rv, 0, 1-strlen($found[0]));
				break;
			//Every other match is removed entirely.
			default:
				$rv = substr($rv, 0, -strlen($found[0]));
		}
	}
	//Otherwise try and remove a REFLEXIVE ending, and then search in turn for
	// (1) an ADJECTIVE,
	// (2) a VERB or (3)
	// a NOUN ending.
	// As soon as one of the endings (1) to (3) is found remove it, and terminate step 1.
	else
	{
		$rv = preg_replace("/(��|��)$/".BX_UTF_PCRE_MODIFIER, "", $rv);
		//ADJECTIVAL
		//The "...1" patterns capture the required preceding character in group 1, so only group 2 is cut;
		//the "...2" patterns have no such constraint and the whole match is cut.
		if(preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found))
			$rv = substr($rv, 0, -strlen($found[2]));
		elseif(preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found))
			$rv = substr($rv, 0, -strlen($found[0]));
		elseif(preg_match($STEMMING_RU_VERB1, $rv, $found))
			$rv = substr($rv, 0, -strlen($found[2]));
		elseif(preg_match($STEMMING_RU_VERB2, $rv, $found))
			$rv = substr($rv, 0, -strlen($found[0]));
		else
			$rv = preg_replace($STEMMING_RU_NOUN, "", $rv);
	}

	//Step 2: If the word ends with i, remove it.
	if(substr($rv, -1) == "�")
		$rv = substr($rv, 0, -1);
	//Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending must lie in R2), and if one is found, remove it.
	//R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
	if(preg_match("/(����|���)$/".BX_UTF_PCRE_MODIFIER, $rv))
	{
		//$rv starts right after the first vowel, so: skip any further vowels, then step over one non-vowel.
		$R1=0;
		$rv_len = strlen($rv);
		while( ($R1<$rv_len) && (strpos($STEMMING_RU_VOWELS, substr($rv,$R1,1))!==false) )
			$R1++;
		if($R1 < $rv_len)
			$R1++;
		//R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel.
		//Starting at R1: skip non-vowels, skip vowels, then step over one non-vowel.
		$R2 = $R1;
		while( ($R2<$rv_len) && (strpos($STEMMING_RU_VOWELS, substr($rv,$R2,1))===false) )
			$R2++;
		while( ($R2<$rv_len) && (strpos($STEMMING_RU_VOWELS, substr($rv,$R2,1))!==false) )
			$R2++;
		if($R2 < $rv_len)
			$R2++;
		//The four-character and the three-character derivational ending: removed only when
		//the whole ending starts at or after the R2 offset.
		if((substr($rv, -4) == "����") && ($rv_len >= ($R2+4)))
			$rv = substr($rv, 0, $rv_len - 4);
		elseif((substr($rv, -3) == "���") && ($rv_len >= ($R2+3)))
			$rv = substr($rv, 0, $rv_len - 3);
	}
	//Step 4: (1) Undouble n, or, (2) if the word ends with a SUPERLATIVE ending, remove it and undouble n, or (3) if the word ends with the soft sign, remove it.
	$rv = preg_replace("/(����|���)$/".BX_UTF_PCRE_MODIFIER, "", $rv);
	$r = preg_replace("/��$/".BX_UTF_PCRE_MODIFIER, "�", $rv);
	//Only when nothing was undoubled is the single trailing character (third case of step 4) stripped.
	if($r == $rv)
		$rv = preg_replace("/�$/".BX_UTF_PCRE_MODIFIER, "", $rv);
	else
		$rv = $r;

	//Re-attach the untouched prefix (up to and including the first vowel) to the stemmed RV part.
	return $word.$rv;
}
?>