#!/usr/bin/env python
# -*- coding:utf-8 -*-
###########################################################################
# Eole NG - 2007
# Copyright Pole de Competence Eole  (Ministere Education - Academie Dijon)
# Licence CeCill  cf /root/LicenceEole.txt
# eole@ac-dijon.fr
#
# search_projs.py
#
# script de recherche des projets contenant des dictionnaires dans
# la forge eole (sur branche de distribution eole 2.4)
#
###########################################################################


#   Liste des exceptions connues (2.5.1)
#
#   dictionnaires non retrouvés dans le dépôt GIT :
#
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-bareos-mysql/24_bareos-mysql.xml  --> le répertoire devrait être mysql/dicos
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-ajaxplorer/61_ajaxplorer.xml  --> pas de branches 2.5.X (remplacé par bareos)
#
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-ovs/90_openvswitch.xml                 paquets obsolètes
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-libvirt/97_libvirt.xml                        "
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-one-market/98_one-market.xml           paquets remplacés par les modules hapy
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-one-node/98_one-node.xml                      "
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-one-flow/99_one-flow.xml                      "
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-one-frontend/99_one-frontend.xml              "
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-one-master/99_one-master.xml                  "
#    - /home/bruno/git/zephir/parc/data/dictionnaires/2.5.1/eole/eole-one-singlenode/99_one-singlenode.xml          "
#
#    Recherche des dictionnaires non référencés dans Zéphir ...
#
#    conf-amon.git                 - sondes/dicos : 11_prelude.xml    obsolète (non packagé)
#    conf-eclair.git               - dicos : 30_eclair.xml            module absent en 2.5
#    conf-zephir.git               - dicos : 30_zephir.xml            module non géré dans Zéphir
#    creole.git                    - tests/dicos : 00_base.xml        dictionnaire de tests unitaires
#    etherdraw.git                 - dicos : 61_etherdraw.xml         pas encore publié en candidate
#    zephir-client.git             - dicos : 99_zephir-stats.xml      à ignorer sur Zéphir (pas de variables ni conteneurs)
#


from urllib import urlretrieve, urlopen
from contextlib import closing
import glob
import os
import shutil
import argparse
from bs4 import BeautifulSoup
import re
from itertools import product
import pickle
import logging
from logging.handlers import RotatingFileHandler
# Module-wide logger shared by every function of this script.
logger = logging.getLogger('DictUpdate')
# Log file written in the current working directory.
# NOTE: no maxBytes is passed, so the handler keeps the library default (0),
# for which the logging documentation states that rollover never occurs.
logger_file_handler = RotatingFileHandler('./maj_dicos.log')
# Console handler. NOTE: despite its name, StreamHandler() created without
# argument writes to sys.stderr (library default), not to stdout.
logger_stdout_handler = logging.StreamHandler()
# The console only shows INFO and above ...
logger_stdout_handler.setLevel(logging.INFO)
logger.addHandler(logger_file_handler)
logger.addHandler(logger_stdout_handler)
# ... while the logger itself lets DEBUG records through to the file handler.
logger.setLevel(logging.DEBUG)

# CHANGE IF NEEDED: distribution versions and their packaging branches.
#
# For each distribution version, the git branches in which debian/control is
# looked up, in the order they are tried (the first branch providing the file
# is used).
#
# For eole 2.4.0, only the packages rebuilt since the stable release are
# detected.

GIT_BRANCHES = {'2.4.0': ['dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
                '2.4.1': ['dist/eole/2.4.1/master', 'dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
                '2.4.2': ['dist/envole/4/master', 'dist/eole/2.4.2/master',
                          'dist/eole/2.4.1/master', 'dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
                '2.5.0': ['dist/envole/4/master', 'dist/eole/2.5.0/master', 'dist/eole/2.5/master',
                          'dist/eole/2.4.2/master', 'dist/eole/2.4.1/master',
                          'dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
                '2.5.1': ['dist/envole/4/master', 'dist/eole/2.5.1/master',
                          'dist/eole/2.5.0/master', 'dist/eole/2.5/master', 'dist/eole/2.4.2/master',
                          'dist/eole/2.4.1/master', 'dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
                # The 2.5.2 branch comes first among the eole branches, like
                # for every other version (it used to be written
                # 'dist/eole/2.5/master', which was then listed twice).
                '2.5.2': ['dist/envole/5/master', 'dist/envole/4/master',
                          'dist/eole/2.5.2/master', 'dist/eole/2.5.1/master',
                          'dist/eole/2.5.0/master', 'dist/eole/2.5/master', 'dist/eole/2.4.2/master',
                          'dist/eole/2.4.1/master', 'dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
                '2.6.0': ['dist/envole/5/master', 'dist/envole/4/master',
                          'dist/eole/2.6.0/master',
                          'dist/eole/2.5.2/master', 'dist/eole/2.5.1/master',
                          'dist/eole/2.5.0/master', 'dist/eole/2.5/master', 'dist/eole/2.4.2/master',
                          'dist/eole/2.4.1/master', 'dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
                '2.6.1': ['dist/envole/5/master', 'dist/envole/4/master',
                          'dist/eole/2.6.1/master', 'dist/eole/2.6.1/develop', 'dist/eole/2.6.0/master',
                          'dist/eole/2.5.2/master', 'dist/eole/2.5.1/master',
                          'dist/eole/2.5.0/master', 'dist/eole/2.5/master', 'dist/eole/2.4.2/master',
                          'dist/eole/2.4.1/master', 'dist/eole/2.4.0/master', 'dist/eole/2.4/master'],
               }

# Packaging tag prefixes; a published version number is appended to a prefix
# to build the name of the git tag to look for.
_ENVOLE_4_TAG = 'pkg/envole/envole-4/'
_ENVOLE_5_TAG = 'pkg/envole/envole-5/'
_EOLE_24_TAG = 'pkg/eole/eole-2.4/'
_EOLE_25_TAG = 'pkg/eole/eole-2.5/'
_EOLE_26_TAG = 'pkg/eole/eole-2.6/'

# For each distribution version, the tag prefixes to try, in order.
PKG_TAGS = {
    '2.4.0': [_EOLE_24_TAG],
    '2.4.1': [_EOLE_24_TAG],
    '2.4.2': [_ENVOLE_4_TAG, _EOLE_24_TAG],
    '2.5.0': [_ENVOLE_4_TAG, _EOLE_25_TAG, _EOLE_24_TAG],
    '2.5.1': [_ENVOLE_4_TAG, _EOLE_25_TAG, _EOLE_24_TAG],
    '2.5.2': [_ENVOLE_5_TAG, _ENVOLE_4_TAG, _EOLE_25_TAG, _EOLE_24_TAG],
    '2.6.0': [_ENVOLE_5_TAG, _ENVOLE_4_TAG,
              _EOLE_26_TAG, _EOLE_25_TAG, _EOLE_24_TAG],
    '2.6.1': [_ENVOLE_5_TAG, _ENVOLE_4_TAG,
              _EOLE_26_TAG, _EOLE_25_TAG, _EOLE_24_TAG],
}

# Components of the EOLE repositories whose index is read.
REPOSITORY_COMPONENTS = ['main/', 'cloud/']
# Only the source index is read. The binary indexes
# ('binary-amd64/Packages', 'binary-i386/Packages') were considered, but
# i386 packages presumably bring nothing more as far as dictionaries go.
# FIXME: rely only on Sources to detect the tag to fetch?
# There is no known way to link a package to its dictionaries, and all the
# packages of a project share the same version number.
REPOSITORY_ARCHS = ['source/Sources']

# URL templates: {0} is the server name, {1} the repository name.
REPOSITORY_BASE_URL = ['http://{0}/eole/dists/{1}/']
REPOSITORY_URLS = [base_url + component + index_path
                   for base_url in REPOSITORY_BASE_URL
                   for component in REPOSITORY_COMPONENTS
                   for index_path in REPOSITORY_ARCHS]
ENVOLE_BASE_URL = ['http://{0}/envole/dists/{1}/main/']
ENVOLE_REPOS_URLS = [base_url + index_path
                     for base_url in ENVOLE_BASE_URL
                     for index_path in REPOSITORY_ARCHS]
# XXX FIXME
#
# Make the repository list configurable? (stable: ['', 'security', 'updates'],
# prestable: [''], candidate: ['', 'security', 'updates', 'proposed-updates'])
# Packages present in several repositories (e.g. updates + stable) should be
# handled by keeping only the most recent one.


def _eole_repositories(release, suffixes=('', '-security', '-updates')):
    """Return the repository names 'eole-<release><suffix>', one per suffix."""
    return ['eole-{0}{1}'.format(release, suffix) for suffix in suffixes]


# Repositories read for each distribution version.
REPOSITORIES = dict((release, _eole_repositories(release))
                    for release in ('2.4.0', '2.4.1', '2.4.2',
                                    '2.5.0', '2.5.1', '2.5.2', '2.6.0'))
# 2.6.1 only reads its base and proposed-updates repositories.
REPOSITORIES['2.6.1'] = _eole_repositories('2.6.1', ('', '-proposed-updates'))

# Envole repository read for each distribution version (versions missing
# from this mapping have no Envole repository).
ENVOLE_VERS = dict.fromkeys(('2.4.2', '2.5.0', '2.5.1'), 'envole-4')
ENVOLE_VERS.update(dict.fromkeys(('2.5.2', '2.6.0'), 'envole-5'))
ENVOLE_VERS['2.6.1'] = 'envole-5-testing'

# Distribution version used by default (also the default value of the
# --distrib command line option); use --distrib to work on another version.
EOLE_VERSION = '2.6.1'

# Gitweb URLs. The templates are filled with
# .format(project, commit, file) to check that a file exists in a project;
# CONTROL_URL is filled with .format(project, branch).
GITWEB_URL = 'http://dev-eole.ac-dijon.fr/gitweb/'
PROJECT_URL = GITWEB_URL + '?p={0};a=tree;h={1};hb={1}'
DICT_URL = GITWEB_URL + '?p={0};a=tree;f={2};hb={1}'
CONTROL_URL = GITWEB_URL + 'gitweb.cgi?p={0};a=blob_plain;f=debian/control;hb=refs/heads/{1}'
RAW_DICT_URL = GITWEB_URL + 'gitweb.cgi?p={0};a=blob_plain;f={2};hb={1}'
# Error texts displayed by gitweb.
ACCESS_ERROR = '403 Forbidden - Reading tree failed'
NOT_FOUND_ERROR = '404 - Reading tree failed'
DOWNLOAD_ERROR = '404 - Cannot find file'

# Absolute path of the directory holding this script. Download directories
# are created in it and the Zéphir dictionaries are looked up relative to its
# parent directory; change it if the script is not run from its usual place.
TOOLS_DIR = os.path.dirname(os.path.abspath(__file__))


# Projects left out of the verification.
MEDDEE_PROJ = {
    'conf-ecdl.git', 'conf-esbl.git',
    'ecdl-outils.git', 'esbl-glpi.git',
    'esbl-grr.git', 'esbl-ocs.git',
    'geo-ide-base.git', 'geo-ide-distribution.git',
    'supervision-psin.git', 'eole-antivir2.git',
}
EXCLUDE_PROJ = {
    'eole-skeletor.git', 'zephir-stats.git',
    'sandbox.git', 'sandbox2.git',
    'creole.git',
}
# test1.git ... test9.git
EXCLUDE_PROJ.update({'test{0}.git'.format(number) for number in range(1, 10)})
# Project sub-directories to leave out (e.g. 'src', which holds the test
# dictionaries of gen_config).
EXCLUDE_SUBDIR = ['src']

# Dictionaries specific to the modules on Zéphir: they are not reported when
# listing the dictionaries that could not be found in the GIT repository.
ZEPHIR_IGNORE = ['29_zephir_container.xml', '29_zephir_redefine.xml']
# Names of the directories that may hold dictionaries (add here any directory
# named otherwise than 'dicos').
DICT_DIRS = ['dicos', 'dicos_mysql']

# Extract the value of the 'h=' / 'hb=' parameters of a gitweb link.
HASH_H_RE = re.compile(r'h=(?P<hash>[a-z0-9]*)')
HASH_HB_RE = re.compile(r'hb=(?P<hash>[a-z0-9]*)')
# Links pointing to a directory listing.
FOLDER_RE = re.compile(r'a=tree')
# Regular expression locating the dictionaries inside the projects: it
# matches the XML files found under one of DICT_DIRS, leaving out the paths
# that start with one of EXCLUDE_SUBDIR.
dico_match = r'a=blob.*f=(?P<folder>(?!{})(.*/))?({})/((?P<subfolder>.*)/)?(?P<name>[\w-]*\.xml)'
# The pattern is built once, then both compiled and printed.
_dico_pattern = dico_match.format('|'.join(sdir + "/" for sdir in EXCLUDE_SUBDIR),
                                  '|'.join(DICT_DIRS))
DICO_RE = re.compile(_dico_pattern)
# Trace of the resulting pattern, printed when the module is loaded. The call
# form with a single argument prints the same text on Python 2 and Python 3.
print(_dico_pattern)

# Links pointing to a file.
FILE_RE = re.compile(r'a=blob')


def parse_package_list(package_list):
    """
    Return the version of every package described in a Debian index.

    The index is split into stanzas on blank lines; the ``Package`` and
    ``Version`` fields of each stanza are read. Stanzas that are blank or
    that lack one of the two fields are skipped instead of aborting the
    whole run. When a package appears several times, the last stanza wins.

    :param package_list: package list as found in debian repo
    :type package_list: str
    :return: package name -> version, with '~' replaced by '_' in the version
    :rtype: dict
    """
    package_re = re.compile(r'^Package:\s*(?P<package>.*)$', re.M)
    version_re = re.compile(r'^Version:\s*(?P<version>.*)$', re.M)
    # a trailing newline may leave a whitespace-only chunk after the split
    stanzas = [s for s in package_list.split('\n\n') if s.strip()]
    logger.debug("{} paquets lus depuis le dépôt".format(len(stanzas)))
    versions = {}
    for stanza in stanzas:
        package = package_re.search(stanza)
        version = version_re.search(stanza)
        if package is None or version is None:
            logger.debug('Stanza without Package or Version field ignored')
            continue
        source_name = package.group('package').strip()
        # '~' is replaced by '_' so that the version can be compared with
        # the packaging tag names
        versions[source_name] = version.group('version').strip().replace('~', '_')
    return versions

def get_package_versions(distribution, test):
    """
    Return the packages and versions published for a distribution.

    Every index of the EOLE repositories of the distribution is read, then
    the Envole index when the distribution has one. Indexes that do not
    answer with code 200 are skipped; a package found in several indexes
    keeps the version of the last index read.

    :param distribution: distribution number
    :type distribution: str
    :param test: read test-eole.ac-dijon.fr instead of eole.ac-dijon.fr
    :type test: bool
    :return: package name -> version (see parse_package_list)
    :rtype: dict
    """
    server_name = "test-eole.ac-dijon.fr" if test else "eole.ac-dijon.fr"
    logger.debug("Récupération des paquets et versions publiés")
    # EOLE packages
    index_urls = []
    for url_template in REPOSITORY_URLS:
        for repo_name in REPOSITORIES[distribution]:
            index_urls.append(url_template.format(server_name, repo_name))
    # ENVOLE packages
    if distribution in ENVOLE_VERS:
        for url_template in ENVOLE_REPOS_URLS:
            index_urls.append(url_template.format(server_name,
                                                  ENVOLE_VERS[distribution]))
    published = {}
    for index_url in index_urls:
        with closing(urlopen(index_url)) as response:
            status = response.code
            index_text = response.read().decode('utf-8') if status == 200 else None
        if index_text is None:
            # some repositories may not exist (e.g. 'cloud' in 2.4.0)
            logger.debug("URL non accessible (code {0}) : {1})".format(status, index_url))
            continue
        logger.info(u'Parsing source packages from {0}'.format(index_url))
        published.update(parse_package_list(index_text))
    return published


def get_project_names(gitweb_url, version):
    """
    Return the projects listed by gitweb with their Debian source name.

    For each project, debian/control is looked up in the branches of
    GIT_BRANCHES[version], in order; the search stops at the first branch
    that provides the file, whether or not it holds a ``Source`` field.

    :param gitweb_url: url of the gitweb project list
    :type gitweb_url: str
    :param version: distribution version
    :type version: str
    :return: (project name, source name or None) couples
    :rtype: set
    """
    logger.debug("Récupération des noms de projets présents sur gitweb")
    with closing(urlopen(gitweb_url)) as content:
        index_page = BeautifulSoup(content.read(), 'lxml')
    listing = index_page.find('table', attrs={'class': 'project_list'})
    source_re = re.compile(r'^Source:\s*(?P<source>.*)$', re.M)

    found = []
    for anchor in listing.find_all('a', attrs={'class': 'list'}, title=None):
        repo_name = anchor.text
        source_name = None
        for branch in GIT_BRANCHES[version]:
            control_url = CONTROL_URL.format(repo_name, branch)
            with closing(urlopen(control_url)) as content:
                if content.code != 200:
                    # no control file in this branch, try the next one
                    continue
                control_text = content.read().decode('utf-8')
            source_match = source_re.search(control_text)
            if source_match:
                source_name = source_match.group('source')
                logger.debug('source trouvée pour {0} : {1}'.format(repo_name, source_name))
            else:
                logger.debug('Source non trouvée dans le fichier control ({0})'.format(control_url))
            break
        if source_name is None:
            logger.debug('Pas de source trouvée pour {0}'.format(repo_name))
        # link between the repository name and the source name (control)
        found.append((repo_name, source_name))
    logger.debug("{} noms et source des projets récupérés".format(len(found)))
    return set(found)


def get_gitweb_tree(root, depth=3):
    """
    Return the links found in a gitweb tree view and in its sub-directories.

    The 'tree' and 'raw' links of the listing are collected; while ``depth``
    is not zero, each directory link is fetched and explored with
    ``depth - 1``.

    :param root: parsed gitweb page (or page fragment) holding the listing
    :type root: bs4 element
    :param depth: number of directory levels explored below ``root``
    :type depth: int
    :return: href of the links found (directories and raw files)
    :rtype: list
    """
    tree_table = root.find('table', attrs={'class': 'tree'})
    if tree_table is None:
        # page without listing (e.g. gitweb error page): nothing to collect
        logger.warning('No tree listing found in gitweb page, directory ignored')
        return []
    tree_links = [anchor['href']
                  for cell in tree_table.find_all('td', attrs={'class': 'link'})
                  for anchor in cell.find_all('a')
                  if anchor.text in ('tree', 'raw')]
    urls = list(tree_links)
    if depth:
        for folder_link in tree_links:
            if not FOLDER_RE.search(folder_link):
                continue
            with closing(urlopen(GITWEB_URL + folder_link)) as content:
                sub_tree = BeautifulSoup(content.read(), 'lxml')
            urls.extend(get_gitweb_tree(sub_tree, depth=depth - 1))
    return urls


def get_filtered_urls(urls, filter_re):
    """
    Return the urls matched by a regular expression, in their original order.

    :param urls: urls list
    :type urls: list of strings
    :param filter_re: compiled regular expression to filter with
    :type filter_re: SRE_Pattern
    :return: urls in which ``filter_re`` finds a match
    :rtype: list of strings
    """
    kept = []
    for candidate in urls:
        if filter_re.search(candidate):
            kept.append(candidate)
    logger.debug('Filtered url list has length {}'.format(len(kept)))
    return kept


def get_xml_dicts(urls):
    """
    Return the urls that point to an XML dictionary (matched by DICO_RE).

    :param urls: urls list
    :type urls: list of string
    :return: urls of the dictionaries
    :rtype: list of string
    """
    xml_dict_urls = get_filtered_urls(urls, DICO_RE)
    return xml_dict_urls


def _get_packaging_tag_hashs(project, source_name, pkg_tag, published=None):
    """
    Return the hashes of the packaging tag matching the published version.

    The tag looked for is ``pkg_tag`` followed by the published version of
    ``source_name``; it is searched in the gitweb tag list of the project.

    :param project: project name as used in gitweb
    :type project: str
    :param source_name: Debian source name of the project
    :type source_name: str
    :param pkg_tag: packaging tag prefix
    :type pkg_tag: str
    :param published: source name -> published version
    :type published: dict
    :return: hashes found in the links of the matching tags (possibly empty);
             None when the source has no known published version or when
             the page has no tag table
    :rtype: list or None
    """
    if not published or source_name not in published:
        # no known published version for project
        return None
    published_tag = '{0}{1}'.format(pkg_tag, published[source_name])
    logger.debug("Recherche du tag {0}".format(published_tag))
    tags_url = GITWEB_URL + "gitweb.cgi?p={0};a=tags"
    with closing(urlopen(tags_url.format(project))) as content:
        project_tags = BeautifulSoup(content.read(), 'lxml')
    tags_body = project_tags.find('table', attrs={'class': 'tags'})
    if tags_body is None:
        return None
    return [HASH_H_RE.search(tag_link['href']).group('hash')
            for tag_link in tags_body.find_all('a', attrs={'class': 'list name'})
            if tag_link.text.strip() == published_tag]


def _get_tree_url_hashs_for_commit(project, commit):
    """
    Return the hashes of the 'tree' link shown on the page of a commit.

    :param project: project name as used in gitweb
    :type project: str
    :param commit: hash identifying a commit
    :type commit: unicode
    :return: values of the 'h' and 'hb' parameters of the tree link
    :rtype: tuple
    """
    commit_url = (GITWEB_URL + "gitweb.cgi?p={0};a=commit;h={1}").format(project, commit)
    with closing(urlopen(commit_url)) as content:
        commit_page = BeautifulSoup(content.read(), 'lxml')
    navigation = commit_page.find('div', attrs={'class': 'page_nav'})
    tree_href = navigation.find('a', text='tree')['href']
    tree_hash = HASH_H_RE.search(tree_href).group('hash')
    base_hash = HASH_HB_RE.search(tree_href).group('hash')
    return (tree_hash, base_hash)


def _get_tree_url_for_commit(project, commit):
    """
    Return the url of the gitweb tree view of a project at a given commit.

    :param project: project name as used in gitweb
    :type project: str
    :param commit: hash identifying a commit
    :type commit: unicode
    :return: url of the tree view
    :rtype: str
    """
    tree_hash, base_hash = _get_tree_url_hashs_for_commit(project, commit)
    url_template = GITWEB_URL + "gitweb.cgi?p={0};a=tree;h={1};hb={2}"
    return url_template.format(project, tree_hash, base_hash)


def get_project_tree_url(project, source_name, pkg_tag, published=None):
    """
    Return the tree url and commit of the packaging tag of a project.

    The first hash returned for the tag is used.

    :param project: project name as used in gitweb
    :type project: str
    :param source_name: Debian source name of the project
    :type source_name: str
    :param pkg_tag: packaging tag pattern
    :type pkg_tag: str
    :param published: source name -> published version
    :type published: dict
    :return: {'tree_url': ..., 'commit': ...}, both None when no tag matches
    :rtype: dict
    """
    hashs = _get_packaging_tag_hashs(project, source_name, pkg_tag, published)
    if not hashs:
        return {'tree_url': None, 'commit': None}
    elected_hash = hashs[0]
    return {'tree_url': _get_tree_url_for_commit(project, elected_hash),
            'commit': elected_hash}


def get_zephirdicts(eole_version=EOLE_VERSION):
    """
    Return all the dictionaries known for a version of the distribution.

    The files <parent of TOOLS_DIR>/data/dictionnaires/<version>/eole/*/*.xml
    are listed; version 2.4.0 is stored in the '2.4' directory.

    :param eole_version: distribution version
    :type eole_version: str
    :return: sorted list of the file names, and a mapping
             file name -> directory holding it
    :rtype: tuple (list, dict)
    """
    path_version = "2.4" if eole_version == "2.4.0" else eole_version
    zephir_pool = os.path.join(os.path.dirname(TOOLS_DIR), 'data',
                               'dictionnaires', path_version)
    dict_names = []
    dict_dirs = {}
    for xml_path in glob.glob(os.path.join(zephir_pool, 'eole', '*', '*.xml')):
        parent_dir, file_name = os.path.split(xml_path)
        dict_names.append(file_name)
        dict_dirs[file_name] = parent_dir
    return sorted(dict_names), dict_dirs


def load_cached_projects(cache_file):
    """
    Return the projects and urls saved by a previous run.

    A missing or unreadable cache file is an expected situation (first run)
    and gives an empty cache. A cache that cannot be unpickled is reported
    and also gives an empty cache, so that a new search can rebuild it.

    NOTE: the cache is a pickle; only load files written by this script,
    unpickling data from an untrusted source can execute arbitrary code.

    :param cache_file: cache file path
    :type cache_file: str
    :return: (projects, download urls), ({}, {}) when no cache is usable
    :rtype: tuple
    """
    try:
        # pickles are binary data
        with open(cache_file, 'rb') as cached_projects:
            return pickle.load(cached_projects)
    except (IOError, OSError):
        # no cache yet
        pass
    except Exception:
        # best effort: a damaged cache must not prevent a new search, but
        # interruptions (KeyboardInterrupt, SystemExit) are not swallowed
        logger.warning('Unreadable cache file %s, ignored', cache_file,
                       exc_info=True)
    return ({}, {})


def get_dictprojs(cache, download=True, exclude=EXCLUDE_PROJ, version=EOLE_VERSION, test=False):
    """
    Browse the gitweb projects and select those holding dictionaries.

    For each project that is not excluded and whose source has a published
    version, the packaging tag of that version is searched with the prefixes
    of PKG_TAGS[version]; the tree of the tagged commit is then browsed to
    collect the urls of its XML dictionaries. Projects whose tagged commit is
    already in the cache are not browsed again.

    :param cache: (projects, download urls) from a previous run; both
                  mappings are updated in place
    :type cache: tuple
    :param download: when true, the previous download directory is moved to
                     '<directory>_backup' and an empty one is created
    :type download: bool
    :param exclude: projects to ignore
    :type exclude: set
    :param version: distribution version
    :type version: str
    :param test: read the test package repository
    :type test: bool
    :return: (project -> {'tree_url', 'commit'}, project -> dictionary urls)
    :rtype: tuple
    """
    download_dir = os.path.join(TOOLS_DIR,
                                'download_dicts_{0}'.format(version))
    if download:
        if os.path.isdir(download_dir):
            backup_dir = "{0}_backup".format(download_dir)
            if os.path.isdir(backup_dir):
                shutil.rmtree(backup_dir)
            shutil.move(download_dir, backup_dir)
            logger.info("Répertoire de download précédent sauvegardé ({0})".format(os.path.basename(backup_dir)))
        os.makedirs(download_dir)

    # read the project list
    logger.info('Recherche des projets et des tag de publication ...')
    # get_project_names returns (project, source) couples: the exclusion has
    # to be checked against the project name, not against the couple
    candidate_projects = sorted(
        (project, source_name)
        for project, source_name in get_project_names(GITWEB_URL, version)
        if project not in exclude)
    projects_count = len(candidate_projects)
    published = get_package_versions(version, test)
    projects, download_urls = cache
    for ind, (project, source_name) in enumerate(candidate_projects):
        if source_name not in published:
            logger.debug('no knwown published source, project {0} ignored'.format(project))
            continue
        logger.debug('Searching packaging tag with version {0}'.format(published[source_name]))
        logger.info('Recherche dans {0}\t{1}/{2}'.format(project, ind+1, projects_count))

        # first tag prefix for which the published version is tagged
        project_tree_url = None
        for pkg_tag in PKG_TAGS[version]:
            candidate = get_project_tree_url(project, source_name, pkg_tag, published=published)
            if candidate['tree_url'] is None:
                logger.debug("Pas de tag de packaging de la forme {}".format(pkg_tag))
                continue
            logger.debug("Lien trouvé pour le tag {}".format(pkg_tag))
            project_tree_url = candidate
            break
        if project_tree_url is None:
            logger.error('!! Aucun tag valide trouvé pour {0} (version : {1})'.format(project, published[source_name]))
            continue

        if project in projects and \
           projects[project]['commit'] == project_tree_url['commit']:
            logger.debug("Liens associés au commit {} déjà archivés".format(projects[project]['commit']))
            continue

        with closing(urlopen(project_tree_url['tree_url'])) as content:
            project_tree = BeautifulSoup(content.read(), 'lxml')
        tree_body = project_tree.find('div', attrs={'class': 'page_body'})
        if tree_body is None:
            # unexpected page: the project is left untouched in the cache
            continue
        logger.debug("Récupération de l'arborescence du projet")
        urls = get_gitweb_tree(tree_body)
        logger.debug("Filtrage des dictionnaires XML")
        dict_urls = get_xml_dicts(urls)
        projects[project] = project_tree_url
        # recorded even when empty, so that dictionaries removed from a
        # project do not stay in the cache
        download_urls[project] = dict_urls
        logger.debug(project_tree_url)
        logger.debug(dict_urls)
    return projects, download_urls


def extract_dict_infos(url):
    """
    Return the dictionary name and directory encoded in a gitweb url.

    The directory is the part of the path found before the dictionaries
    directory followed by the part found after it; the name of the
    dictionaries directory itself is left out.

    :param url: gitweb url for dictionnary
    :type url: str
    :return: (file name, directory)
    :rtype: tuple
    """
    found = DICO_RE.search(url)
    dict_name = found.group('name')
    dict_dir = ''.join(found.group(part) or '' for part in ('folder', 'subfolder'))
    return (dict_name, dict_dir)


def download_dicts(dl_urls, exclude=EXCLUDE_PROJ, version=EOLE_VERSION):
    """
    Download the XML dictionaries of the projects.

    Each dictionary is fetched once and written under
    <TOOLS_DIR>/download_dicts_<version>/<project>/<directory>/. A
    dictionary that gitweb does not deliver (HTTP code other than 200, or
    'Cannot find file' error page) is reported and not written.

    :param dl_urls: urls pointing to xml for each projects
    :type dl_urls: dict
    :param exclude: projects to ignore
    :type exclude: set
    :param version: distribution version
    :type version: str
    """
    download_dir = os.path.join(TOOLS_DIR, 'download_dicts_{0}'.format(version))
    logger.info("Téléchargement des dictionnaires")
    # the payload is read as bytes
    error_marker = DOWNLOAD_ERROR.encode('ascii')
    for project, dl_info in dl_urls.items():
        if project in exclude:
            continue
        project_name = os.path.splitext(project)[0]
        for dict_url in dl_info:
            dict_name, dict_dir = extract_dict_infos(dict_url)
            msg = "Téléchargement du dictionnaire {}"
            logger.debug(msg.format(dict_name))
            with closing(urlopen(GITWEB_URL + dict_url)) as content:
                status = content.code
                payload = content.read()
            # check that the file was really found
            if status != 200 or error_marker in payload:
                logger.error("Téléchargement impossible (code {0}) : {1}".format(status, dict_url))
                continue
            proj_dir = os.path.join(download_dir, project_name, dict_dir)
            if not os.path.isdir(proj_dir):
                os.makedirs(proj_dir)
            with open(os.path.join(proj_dir, dict_name), 'wb') as xml_file:
                xml_file.write(payload)


def check_missing(dl_infos, exclude=EXCLUDE_PROJ, version=EOLE_VERSION):
    """
    Return the dictionaries found in the projects but unknown to Zéphir.

    The names are compared with the dictionaries listed by get_zephirdicts;
    the unknown ones are logged as a warning, in alphabetical order.

    :param dl_infos: urls pointing to xml for each projects
    :type dl_infos: dict
    :param exclude: projects to ignore
    :type exclude: set
    :param version: distribution version
    :type version: str
    :return: names of the dictionaries that are not referenced
    :rtype: set
    """
    known_dicts = set(get_zephirdicts(version)[0])
    found_dicts = set(DICO_RE.search(url).group('name')
                      for git_proj, urls in dl_infos.items()
                      if git_proj not in exclude
                      for url in urls)
    unknown_dicts = found_dicts.difference(known_dicts)
    if unknown_dicts:
        msg = "Les dictionnaires suivants ne sont pas référencés :\n\n\t{}"
        logger.warning(msg.format('\n\t'.join(sorted(unknown_dicts))))
    else:
        logger.info("Aucun dictionnaire non référencé")
    return unknown_dicts


def update_dicos(dl_infos, exclude=EXCLUDE_PROJ, version=EOLE_VERSION):
    """
    Copy the downloaded dictionaries over those of the Zéphir folder.

    Only the dictionaries already present in the Zéphir folder are updated.
    Nothing is copied when the same dictionary name is found twice in the
    projects. The Zéphir dictionaries that are not found in the projects
    are listed, except those of ZEPHIR_IGNORE.

    :param dl_infos: urls pointing to xml for each projects
    :type dl_infos: dict
    :param exclude: projects to ignore
    :type exclude: set
    :param version: distribution version
    :type version: str
    :return: False when duplicates were detected or when a dictionary to
             copy was not downloaded, True otherwise
    :rtype: bool
    """
    # dictionaries found in the repositories: name -> path in download_dir;
    # duplicates are reported as errors
    download_dir = os.path.join(TOOLS_DIR, 'download_dicts_{0}'.format(version))
    dict_git = {}
    errors = []
    for proj, dict_infos in dl_infos.items():
        if proj in exclude:
            continue
        proj_name = os.path.splitext(proj)[0]
        for dict_url in dict_infos:
            xml, dict_dir = extract_dict_infos(dict_url)
            if xml in dict_git:
                errors.append("* le dictionnaire {0} ({1}, {2}) est déjà référencé : {3}".format(xml, proj_name, dict_dir, dict_git[xml]))
            dict_git[xml] = os.path.join(download_dir, proj_name, dict_dir, xml)
    if errors:
        print("!! ERREURS DETECTEES !!")
        print("\n".join(errors))
        return False

    # dictionaries already in place in Zéphir
    known_dicts, paths = get_zephirdicts(version)
    not_found = []
    complete = True
    for zeph_dict in known_dicts:
        dst_dict = os.path.join(paths[zeph_dict], zeph_dict)
        if zeph_dict in dict_git:
            src_dict = dict_git[zeph_dict]
            if not os.path.isfile(src_dict):
                # referenced in a project but absent from the download
                # directory: reported instead of aborting the other copies
                logger.error("Dictionnaire non téléchargé, copie ignorée : {0}".format(src_dict))
                complete = False
                continue
            # copy the dictionary into the local repository
            print("mise en place de la version du dépôt: {0} ({1})".format(zeph_dict, paths[zeph_dict].split('data/dictionnaires/')[-1]))
            shutil.copy(src_dict, dst_dict)
        elif zeph_dict not in ZEPHIR_IGNORE:
            # dictionary present on Zéphir but not found in the repositories
            not_found.append(dst_dict)
    if not_found:
        # dictionaries present on Zéphir but not found
        print("\ndictionnaires non retrouvés dans le dépôt GIT :\n\n- {0}".format("\n- ".join(not_found)))
    return complete

if __name__ == '__main__':
    # Command line entry point: for each requested distribution version,
    # locate the dictionaries (or reuse the cached locations), download
    # them, update the Zéphir dictionaries and list the unknown ones.
    parser = argparse.ArgumentParser(description=u"Outil de vérification des dictionnaires Zéphir depuis gitweb")

    parser.add_argument('-s', '--search', action="store_true", default=False,
                        help=u"Force la recherche dans l'ensemble des projets")

    parser.add_argument('-n', '--no-dl', action="store_true", default=False,
                        help=u"pas de téléchargement, utilise cache local si présent")

    parser.add_argument('-t', '--test', action="store_true", default=False,
                        help=u"Utilise le dépôt test-eole.ac-dijon.fr au lieu de eole.ac-dijon.fr")
    dist_choices = [u'all'] + sorted(GIT_BRANCHES)
    parser.add_argument('-d', '--distrib', help="version de la distribution",
                        choices=dist_choices, default=EOLE_VERSION)

    options = parser.parse_args()

    if options.distrib == u'all':
        eole_versions = sorted(PKG_TAGS)
    else:
        eole_versions = [options.distrib, ]

    # Default exclusions for the Eole distribution
    exclusions = EXCLUDE_PROJ.union(MEDDEE_PROJ)

    for eole_version in eole_versions:
        msg = "Recherche des dictionnaires pour la distribution {}"
        logger.info(msg.format(eole_version))
        cache = './cached_projects/cached_projects_{0}'.format(eole_version)
        if os.path.isfile(cache):
            old_cache = "./cached_projects_{0}.backup".format(eole_version)
            msg = "Copie des résultats de la recherche précédente dans {}"
            logger.info(msg.format(old_cache))
            shutil.copy(cache, old_cache)
        if options.search or not os.path.isfile(cache):
            # with --no-dl the previous download directory is kept as is:
            # it is the only source of the files copied afterwards
            entry_points, dl_infos = get_dictprojs(load_cached_projects(cache),
                                                   download=not options.no_dl,
                                                   exclude=exclusions,
                                                   version=eole_version,
                                                   test=options.test)
            # the cache directory does not exist on a first run
            cache_dir = os.path.dirname(cache)
            if not os.path.isdir(cache_dir):
                os.makedirs(cache_dir)
            # pickles are binary data
            with open(cache, 'wb') as cache_file:
                pickle.dump((entry_points, dl_infos), cache_file)
                msg = "Localisation des dictionnaires sauvée dans {}"
                logger.info(msg.format(cache))
        else:
            entry_points, dl_infos = load_cached_projects(cache)
        if options.no_dl is False:
            download_dicts(dl_infos, exclude=exclusions, version=eole_version)
        logger.info("Mise à jour des dictionnaires existants …")
        updated = update_dicos(dl_infos, exclude=exclusions, version=eole_version)
        logger.info("Recherche des dictionnaires non référencés dans Zéphir …")
        check_missing(dl_infos, exclude=exclusions, version=eole_version)
        if updated is False:
            logger.error("Mise à jour des dictionnaires incomplète (voir les erreurs ci-dessus).")
        else:
            logger.info("Mise à jour des dictionnaires effectuée.")
