Scripts keyword

From Mondothèque

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import time
from random import *

import nltk


# VARIABLES

# Timestamp taken once at start-up, formatted day-month-year "à" hour:minute:second.
now = time.strftime("%d-%m-%Y à %H:%M:%S")
# List that select_sentences() appends the chosen sentences to.
selected_text = []


# FUNCTIONS
# Check if sentence contains keyword, if so, generate fake paragraphs with these sentences

def split_sentences():
    """Read "4_inventions_a_faire.txt" and return its content as a list of sentences.

    The text is split with NLTK's pre-trained Punkt sentence tokenizer.
    """
    # NOTE(review): the English Punkt model is loaded, while the input file
    # name looks French -- confirm this is the intended model.
    with open("4_inventions_a_faire.txt") as source:
        raw_text = source.read()
        punkt = nltk.data.load('tokenizers/punkt/english.pickle')
        return punkt.tokenize(raw_text)
   

def select_sentences(sentences):
    """Collect the sentences that contain a keyword, grouped into fake paragraphs.

    A sentence is selected when at least one of its space-separated words is
    found in the module-level ``keywords`` list. Every selected sentence is
    appended exactly once, followed by a single space, to the module-level
    ``selected_text`` list. After a random run of 1 to 6 selected sentences,
    two newlines are added to the last one so that it closes a paragraph.

    sentences -- iterable of sentence strings.

    Returns the module-level ``selected_text`` list (the same object that was
    appended to).
    """
    # The counter lives outside the loops so that it really counts the
    # sentences of the current paragraph; resetting it per word would pin it
    # at 1 and make the paragraph logic meaningless.
    sentences_in_paragraph = 0
    paragraph_length = randint(1, 6)

    for sentence in sentences:
        words = sentence.split(" ")
        # One match is enough: a sentence that mentions a keyword several
        # times must still be selected only once.
        if not any(word in keywords for word in words):
            continue

        sentences_in_paragraph += 1
        sentence = sentence + ' '
        if sentences_in_paragraph == paragraph_length:
            # Close the paragraph and draw the length of the next one.
            sentence = sentence + "\n\n"
            sentences_in_paragraph = 0
            paragraph_length = randint(1, 6)
        selected_text.append(sentence)

    return selected_text


# Write to file

def writetofile(content):
    """Append *content* to the file "sentences.txt".

    Writing is best-effort: an IOError raised while opening or writing the
    file is not propagated, but it is reported on stderr so that a missing or
    incomplete output file does not go unnoticed.

    content -- the string to append.
    """
    try:
        # The context manager closes the file even when write() fails.
        with open("sentences.txt", "a") as logfile:
            logfile.write(content)
    except IOError as err:
        sys.stderr.write("writetofile: could not write to sentences.txt: %s\n" % err)


# keywords

# Words that make a sentence eligible for selection.
keywords = ["machine", "machines"]

# OR
# keywords = []
# for line in open("lelivre_extrait.txt"):
#     for word in line.split():
#         if word.endswith('ing'):
#             keywords.append(word)


# SCRIPT
# split text into sentences

sentences = split_sentences()

# select sentences based on keywords
selected_text = select_sentences(sentences)

# write to new file: header lines first, then the selected sentences,
# then a footer carrying the generation timestamp
writetofile('Traîté de la Documentation\n')
writetofile('Paul Otlet & Henri Lafontaine\n')
writetofile('Sélection à base de ' + ', '.join(keywords) + '\n\n\n\n')
for sentence in selected_text:
    writetofile(sentence)

writetofile('\n\n\nCe texte a été généré le '+now + '.')