Scripts keyword
From Mondothèque
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import time
from random import *

import nltk
# VARIABLES
# Timestamp of this run (day-month-year, then time); written at the end of
# the generated text.
now = time.strftime("%d-%m-%Y à %H:%M:%S")
# Sentences picked by select_sentences() are accumulated in this list.
selected_text = []
# FUNCTIONS
# Check if a sentence contains a keyword; if so, generate fake paragraphs with these sentences
def split_sentences():
    """Read ``4_inventions_a_faire.txt`` and split its text into sentences.

    Returns:
        The result of tokenizing the whole file content with the NLTK Punkt
        model loaded from ``tokenizers/punkt/english.pickle``.

    Raises:
        IOError: If the input file cannot be opened or read.
    """
    with open("4_inventions_a_faire.txt") as f:
        text = f.read()

    # Split text into sentences with help of nltk.
    # NOTE(review): the English Punkt model is loaded although the input file
    # name looks French -- confirm that this is intended.
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_tokenizer.tokenize(text)
    return sentences
def select_sentences(sentences):
    """Collect the sentences that contain one of the module-level ``keywords``.

    Every sentence is split on single spaces and each resulting word is
    compared, as an exact match, against ``keywords``. For each matching word
    a space is appended to the sentence and the sentence is added to the
    module-level ``selected_text`` list. A sentence holding several matching
    words is therefore added once per match.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        The module-level ``selected_text`` list, extended in place.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # one-line source -- confirm against the original script.
    for sentence in sentences:
        for word in sentence.split(" "):
            # NOTE(review): the counter restarts for every word, so it is
            # always 1 when compared below; the paragraph break is thus added
            # whenever randint(1, 6) returns 1. Presumably it was meant to
            # count across sentences -- confirm before changing.
            matches = 0
            if word in keywords:
                matches += 1
                sentence = sentence + ' '
                if matches == randint(1, 6):
                    # A blank line ends the current "fake paragraph".
                    sentence = sentence + "\n\n"
                    matches = 0
                selected_text.append(sentence)
    return selected_text
# Write to file
def writetofile(content):
    """Append *content* to ``sentences.txt`` in the current directory.

    Writing is best-effort: an I/O failure never interrupts the script, but
    it is reported on stderr instead of being dropped silently.

    Args:
        content: Text to append. It is written as-is; no newline is added.
    """
    try:
        # The context manager closes the file even when the write fails.
        with open("sentences.txt", "a") as logfile:
            logfile.write(content)
    except IOError as error:
        sys.stderr.write("Could not write to sentences.txt: %s\n" % error)
# keywords
# Words that select_sentences() looks for; matching is exact.
keywords = ["machine", "machines"]
# OR
# keywords = []
# for line in open("lelivre_extrait.txt"):
#     for word in line.split():
#         if word.endswith('ing'):
#             keywords.append(word)
# SCRIPT
# Split the source text into sentences.
sentences = split_sentences()

# Select sentences based on the keywords.
selected_text = select_sentences(sentences)

# Write to the output file: a three-line header (title, authors, keywords
# used), then every selected sentence, then the generation timestamp.
writetofile('Traîté de la Documentation\n')
writetofile('Paul Otlet & Henri Lafontaine\n')
writetofile('Sélection à base de ' + ', '.join(keywords) + '\n\n\n\n')
for sentence in selected_text:
    writetofile(sentence)
writetofile('\n\n\nCe texte a été généré le ' + now + '.')