diff options
author | Remy Pecqueur <remy.pecqueur@student.ecp.fr> | 2012-01-04 20:58:04 +0100 |
---|---|---|
committer | Remy Pecqueur <remy.pecqueur@student.ecp.fr> | 2012-01-04 20:58:04 +0100 |
commit | d49b7c4095432e18ccee856c15ba91c4d2e7464c (patch) | |
tree | 8e5972e27669b4d2d1fa9f31ac96ffc1947e582e | |
parent | 08a5911bc2392792d31a5cdfa5780af1398167c4 (diff) |
Functional vector searches
-rw-r--r-- | app/Resources/views/layout.html.twig | 3 | ||||
-rw-r--r-- | app/Resources/views/vector_search.html.twig | 4 | ||||
-rw-r--r-- | src/Riu/Controller/Search.php | 17 | ||||
-rw-r--r-- | src/Riu/Provider/RiuServiceProvider.php | 6 | ||||
-rw-r--r-- | src/Riu/Search/FrequencyWeighter.php | 10 | ||||
-rw-r--r-- | src/Riu/Search/TfIdfWeighter.php | 20 | ||||
-rw-r--r-- | src/Riu/Search/VectorEngine.php | 11 |
7 files changed, 49 insertions, 22 deletions
diff --git a/app/Resources/views/layout.html.twig b/app/Resources/views/layout.html.twig index d180d61..cef03b5 100644 --- a/app/Resources/views/layout.html.twig +++ b/app/Resources/views/layout.html.twig @@ -25,7 +25,8 @@ <div class="container"> <a href="{{ path('homepage') }}" class="brand">Riu search</a> <ul class="nav"> - <li class="{{ active_page|default('') == 'vector' ? 'active': '' }}"><a href="{{ path('vector_search') }}">Recherche vectorielle</a></li> + <li class="{{ active_page|default('') == 'vector_frequency' ? 'active': '' }}"><a href="{{ path('vector_frequency_search') }}">Recherche vectorielle (fréquence)</a></li> + <li class="{{ active_page|default('') == 'vector_tfidf' ? 'active': '' }}"><a href="{{ path('vector_tfidf_search') }}">Recherche vectorielle (Tf-Idf)</a></li> <li class="{{ active_page|default('') == 'boolean' ? 'active': '' }}"><a href="{{ path('boolean_search') }}">Recherche booléenne</a></li> <li class="{{ active_page|default('') == 'probability' ? 'active': '' }}"><a href="{{ path('probability_search') }}">Recherche probabiliste</a></li> </ul> diff --git a/app/Resources/views/vector_search.html.twig b/app/Resources/views/vector_search.html.twig index 672fabc..5e26872 100644 --- a/app/Resources/views/vector_search.html.twig +++ b/app/Resources/views/vector_search.html.twig @@ -1,13 +1,13 @@ {% extends 'layout.html.twig' %} -{% set active_page = 'vector' %} +{% set active_page = type %} {% block content %} <div class="page-header"> <h2>Recherche vectorielle</h2> </div> <div> - <form action="{{ path('vector_search') }}" method="GET"> + <form action="{{ path(type ~ '_search') }}" method="GET"> <div class="input"> <label for="search">Critère:</label> <input type="search" name="q" value="{{ query|default('') }}" id="search" /> diff --git a/src/Riu/Controller/Search.php b/src/Riu/Controller/Search.php index ada1f63..3977141 100644 --- a/src/Riu/Controller/Search.php +++ b/src/Riu/Controller/Search.php @@ -34,15 +34,26 @@ class Search implements ControllerProviderInterface })->bind('boolean_search'); $controllers->get('/vector', function(Application $app, Request $request){ - $params = array(); + $params = array('type' => 'vector_frequency'); + if ($request->query->has('q')) { + $query = $request->query->get('q'); + $params['query'] = $query; + $params['results'] = $app['search.engine.vector.frequency']->search($query); + } + + return $app['twig']->render('vector_search.html.twig', $params); + })->bind('vector_frequency_search'); + + $controllers->get('/vector/tf_idf', function(Application $app, Request $request){ + $params = array('type' => 'vector_tfidf'); if ($request->query->has('q')) { $query = $request->query->get('q'); $params['query'] = $query; - $params['results'] = $app['search.engine.vector']->search($query); + $params['results'] = $app['search.engine.vector.tfidf']->search($query); } return $app['twig']->render('vector_search.html.twig', $params); - })->bind('vector_search'); + })->bind('vector_tfidf_search'); $controllers->get('/document/{id}', function(Application $app, $id){ $document = $app['search.matcher']->getDocument($id); diff --git a/src/Riu/Provider/RiuServiceProvider.php b/src/Riu/Provider/RiuServiceProvider.php index 73dfa24..33c2a51 100644 --- a/src/Riu/Provider/RiuServiceProvider.php +++ b/src/Riu/Provider/RiuServiceProvider.php @@ -46,10 +46,14 @@ class RiuServiceProvider implements ServiceProviderInterface return new BooleanEngine($app['search.matcher']); }); - $app['search.engine.vector'] = $app->share(function(Application $app){ + $app['search.engine.vector.frequency'] = $app->share(function(Application $app){ return new VectorEngine($app['search.indexer'], $app['search.matcher'], $app['search.weighter.frequency']); }); + $app['search.engine.vector.tfidf'] = $app->share(function(Application $app){ + return new VectorEngine($app['search.indexer'], $app['search.matcher'], $app['search.weighter.tfidf']); + }); + $app['search.engine.probability'] = $app->share(function(Application $app){ return new ProbabilityEngine($app['search.matcher'], $app['search.indexer']); }); diff --git a/src/Riu/Search/FrequencyWeighter.php b/src/Riu/Search/FrequencyWeighter.php index c49bd07..7092fb3 100644 --- a/src/Riu/Search/FrequencyWeighter.php +++ b/src/Riu/Search/FrequencyWeighter.php @@ -25,12 +25,10 @@ class FrequencyWeighter implements WeighterInterface } } - foreach ($this->matcher->getDocuments() as $id => $doc) { - if (!empty($weight[$id])) { - $max = max($weight[$id]); - foreach ($weight[$id] as $word => $w) { - $weight[$id][$word] /= $max; - } + $max = max(max($weight)); + foreach ($weight as $id => $t) { + foreach ($weight[$id] as $word => $w) { + $weight[$id][$word] /= $max; } } diff --git a/src/Riu/Search/TfIdfWeighter.php b/src/Riu/Search/TfIdfWeighter.php index 609d2d3..2027b94 100644 --- a/src/Riu/Search/TfIdfWeighter.php +++ b/src/Riu/Search/TfIdfWeighter.php @@ -32,12 +32,11 @@ class TfIdfWeighter implements WeighterInterface } } - foreach ($documents as $id => $doc) { - if (!empty($weight[$id])) { - $max = max($weight[$id]); - foreach($weight[$id] as $word => $w) { - $weight[$id][$word] /= $max; - } + $max = max(max($weight)); + + foreach ($weight as $id => $t) { + foreach($weight[$id] as $word => $w) { + $weight[$id][$word] /= $max; } } @@ -60,14 +59,17 @@ class TfIdfWeighter implements WeighterInterface $idf = log( count($documents) / count($wordUses) ); $tf = log( $count / $sum + 1 ); $weight[$word] = $tf * $idf; + } else { + $weight[$word] = 0; } } $max = max($weight); - foreach ($weight as $word => $w) { - $weight[$word] /= $max; + if ($max != 0) { + foreach ($weight as $word => $w) { + $weight[$word] /= $max; + } } - return $weight; } } diff --git a/src/Riu/Search/VectorEngine.php b/src/Riu/Search/VectorEngine.php index fbf9b57..dff1acd 100644 --- a/src/Riu/Search/VectorEngine.php +++ b/src/Riu/Search/VectorEngine.php @@ -28,6 +28,8 @@ class VectorEngine implements EngineInterface $queryDoc = new Document('-1', $query); $queryWords = $this->indexer->indexDocument($queryDoc); $similarity = array(); + $docNorm = array(); + $queryNorm = 0; if (empty($queryWords)) { return array(); @@ -41,14 +43,23 @@ class VectorEngine implements EngineInterface $docWeight = $this->weighter->getDocumentWeights($queryWords); foreach ($queryWords as $word => $t) { + $queryNorm += $queryWeight[$word] * $queryWeight[$word]; foreach ($this->matcher->getWordUses($word) as $id => $count) { if (isset($similarity[$id])) { $similarity[$id] += $queryWeight[$word] * $docWeight[$id][$word]; + $docNorm[$id] += $docWeight[$id][$word] * $docWeight[$id][$word]; } else { $similarity[$id] = $queryWeight[$word] * $docWeight[$id][$word]; + $docNorm[$id] = $docWeight[$id][$word] * $docWeight[$id][$word]; } } } + $queryNorm = sqrt($queryNorm); + + foreach ($similarity as $id => $s) { + $docNorm[$id] = sqrt($docNorm[$id]); + $similarity[$id] = $s / ($docNorm[$id] * $queryNorm); + } arsort($similarity); |