This repository was archived by the owner on Mar 29, 2021. It is now read-only.

Commit 9884a4e

Author: staticdev
Commit message: "commit inicial" (initial commit)
0 parents, commit 9884a4e

6 files changed: +214 −0 lines


Dockerfile

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
FROM python:2.7.14-slim

# set the working directory to /app
WORKDIR /app

# copy the current directory contents into the container at /app
COPY . /app

# install git flex libcpanplus-perl make
RUN apt-get update && \
    apt-get install -y git flex libcpanplus-perl make

# install perl libs
RUN export PERL_MM_USE_DEFAULT=1 && perl -MCPAN -e 'install List::MoreUtils; install Text::LevenshteinXS; install Parallel::Loops'

# install requirements
RUN pip install -r requirements.txt

# download UGCNormal
RUN git clone https://github.com/carolcoimbra/UGCNormal.git ugc_norm

# configure UGCNormal
RUN cd ugc_norm && sh configure.sh

EXPOSE 5000

# run app.py when the container launches
CMD ["gunicorn", "app:APP", "-b", ":5000"]

README.rst

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
ugcnormal-microservice
======================

REST microservice for pt_BR text normalization using `UGCNormal <https://github.com/avanco/UGCNormal>`_. Suited to applications that need online normalization, such as chatbots.

Web service based on `ugcnormal_interface <https://github.com/thiagootuler/ugcnormal_interface>`_.

Requirements
------------

* Install Docker-CE 17.12.0+
* 900 MB of disk space for the image

Running
-------

Run the commands:

.. code-block:: sh

   # build the image
   sudo docker build -t ugcnormal .
   # check that the image was built
   sudo docker images
   # start a container from the image
   sudo docker run --name ugcnormal -d -p 5000:5000 --env "UGCNORMAL=./ugc_norm/speller" ugcnormal
   # check that the process is running
   sudo docker ps -a

   # to stop the container, look up its name in docker ps -a and run
   sudo docker stop ugcnormal
   # to remove a container (it must be stopped first)
   sudo docker rm ugcnormal

Usage examples
--------------

Simply POST the message to be normalized to the /reply URL, passing the message in the "message" field and the method in the "method" field.

Available methods:

* token: tokenizer
* spell: speller
* acronym: acronym searcher
* textese: untextese
* proper_noun: proper noun normalizer

The normalized message is returned in the "reply" field. The request status is returned in the "status" field, whose value on success is "ok" (e.g. ``{"status": "ok", "reply": "<normalized text>"}``).

curl example:

.. code-block:: sh

   curl -X POST \
     http://localhost:5000/reply \
     -H 'content-type: application/json; charset=utf-8' \
     -d '{
       "message": "oi td bm?",
       "method": "spell"
     }'

Native Python 3 example (http.client):

.. code-block:: python

   import http.client

   conn = http.client.HTTPConnection("localhost:5000")

   payload = "{\"message\": \"oi td bm?\", \"method\": \"spell\"}"

   headers = {
       'content-type': "application/json; charset=utf-8"
   }

   conn.request("POST", "/reply", payload, headers)
   res = conn.getresponse()
   data = res.read()

   print(data.decode("utf-8"))
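
The "status" and "reply" fields described above can be read by parsing the decoded response body as JSON. A minimal continuation of the http.client example, assuming the body is the JSON object produced by app.py:

import json

result = json.loads(data.decode("utf-8"))
if result["status"] == "ok":
    print(result["reply"])
else:
    print(result["error"])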

app.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from flask import Flask, request, jsonify
from normalizer import Normalizer

APP = Flask(__name__)
APP.config['JSON_AS_ASCII'] = False  # retrieve UTF-8 messages

NORM = Normalizer()

@APP.route('/reply', methods=['POST'])
def reply():
    params = request.json
    if not params:
        return jsonify({
            "status": "error",
            "error": "Request must be of the application/json type!",
        })

    message = params.get("message")
    method = params.get("method")

    # Make sure the required params are present.
    if message is None or method is None:
        return jsonify({
            "status": "error",
            "error": "message and method are required keys"
        })

    # Map API method names to the corresponding Normalizer operations.
    methods = {'token': NORM.tokenizer,
               'spell': NORM.speller,
               'acronym': NORM.acronym_searcher,
               'textese': NORM.untextese,
               'proper_noun': NORM.proper_noun_normalizer
               }

    try:
        reply = methods[method](message)
    except KeyError:
        return jsonify({
            "status": "error",
            "error": "method not valid, try one of the following: token, spell, acronym, textese or proper_noun"
        })

    # Send the response.
    return jsonify({
        "status": "ok",
        "reply": reply
    })
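
The /reply endpoint above can also be exercised without a running container through Flask's test client. A minimal sketch, assuming the UGCNormal tools have already been set up under ./ugc_norm (as the Dockerfile does) so the Normalizer subprocess calls can succeed:

# -*- coding: utf-8 -*-
import json

from app import APP

client = APP.test_client()
response = client.post(
    '/reply',
    data=json.dumps({"message": "oi td bm?", "method": "token"}),
    content_type='application/json; charset=utf-8',
)
result = json.loads(response.data)
print(result["status"])  # "ok" when the tokenizer ran successfully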

normalizer.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
from io import open
from subprocess import PIPE, Popen

class Normalizer(object):
    def file_save(self, text):
        norm_file = open("./temp/file.txt", mode="w", encoding="utf-8")
        norm_file.write(text.decode('utf-8'))
        norm_file.close()

    def tokenizer(self, text):
        # Apply the tokenizer.
        echo = Popen(['echo', text], stdout=PIPE)
        process = Popen(['./ugc_norm/tokenizer/webtok'], stdin=echo.stdout, stdout=PIPE)
        output = process.communicate()[0]
        return output

    def speller(self, text):
        tokens = self.tokenizer(text)
        # Apply the spell checker.
        self.file_save(tokens)
        current_directory = Popen('pwd', shell=False, stdout=PIPE)
        previous_path = current_directory.communicate()[0]
        command = 'perl ./ugc_norm/speller/spell.pl -stat ./ugc_norm/speller/lexicos/regra+cb_freq.txt -f ' + previous_path[:-1] + '/temp/file.txt'
        process = Popen(command.split(), shell=False, stdout=PIPE)
        output = process.communicate()[0]
        return output

    def acronym_searcher(self, text):
        checked_text = self.speller(text)
        # Normalize acronyms.
        self.file_save(checked_text)
        process = Popen('perl ./ugc_norm/siglas_map.pl ./ugc_norm/resources/lexico_siglas.txt ./temp/file.txt'.split(), shell=False, stdout=PIPE)
        output = process.communicate()[0]
        return output

    def untextese(self, text):
        text_with_acronyms = self.acronym_searcher(text)
        # Normalize internet slang ("internetes").
        self.file_save(text_with_acronyms)
        process = Popen('perl ./ugc_norm/internetes_map.pl ./ugc_norm/resources/lexico_internetes.txt ./ugc_norm/resources/lexico_internetes_sigl_abrv.txt ./temp/file.txt'.split(), shell=False, stdout=PIPE)
        output = process.communicate()[0]
        return output

    def proper_noun_normalizer(self, text):
        without_textese = self.untextese(text)
        # Normalize proper nouns.
        self.file_save(without_textese)
        process = Popen('perl ./ugc_norm/np_map.pl ./ugc_norm/resources/lexico_nome_proprio.txt ./temp/file.txt'.split(), shell=False, stdout=PIPE)
        output = process.communicate()[0]
        return output
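
Normalizer chains the UGCNormal command-line tools through subprocess pipes, each method building on the previous one (speller tokenizes first, acronym_searcher runs the speller, and so on). It can also be used directly, outside the Flask app; a minimal sketch, assuming it is run from the repository root so the relative ./ugc_norm and ./temp paths resolve:

# -*- coding: utf-8 -*-
from normalizer import Normalizer

norm = Normalizer()
tokens = norm.tokenizer("oi td bm?")   # UGCNormal webtok tokenizer only
corrected = norm.speller("oi td bm?")  # tokenizer followed by the spell checker
print(corrected)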

requirements.txt

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
numpy
scipy
Flask==0.12.2
gunicorn==19.7.1
multiprocessing
nltk
sklearn

temp/file.txt

Whitespace-only changes.
