|
|
@ -1,7 +1,5 @@
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
# Apache 2.0
|
|
|
|
# Apache 2.0
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import argparse
|
|
|
|
import codecs
|
|
|
|
import codecs
|
|
|
|
import sys
|
|
|
|
import sys
|
|
|
@ -12,15 +10,13 @@ is_python2 = sys.version_info[0] == 2
|
|
|
|
def get_parser():
    """Build the command-line parser for the word-filter script.

    The parser accepts:

    * ``--exclude`` / ``-v``: boolean flag stored as ``exclude``
      (``False`` unless given); its help text is "exclude filter words".
    * ``filt``: positional string, described as the filter list.
    * ``infile``: positional string, described as the input file.

    Returns:
        argparse.ArgumentParser: the configured parser. It uses
        ``ArgumentDefaultsHelpFormatter`` so ``--help`` shows default values.
    """
    parser = argparse.ArgumentParser(
        description="filter words in a text file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--exclude",
        "-v",
        dest="exclude",
        action="store_true",
        help="exclude filter words",
    )
    parser.add_argument("filt", type=str, help="filter list")
    parser.add_argument("infile", type=str, help="input file")
    return parser
|
|
|
@ -37,29 +33,20 @@ def filter_file(infile, filt, exclude):
|
|
|
|
for line in vocabfile:
|
|
|
|
for line in vocabfile:
|
|
|
|
vocab.add(line.strip())
|
|
|
|
vocab.add(line.strip())
|
|
|
|
|
|
|
|
|
|
|
|
sys.stdout = codecs.getwriter("utf-8")(
|
|
|
|
sys.stdout = codecs.getwriter("utf-8")(sys.stdout
|
|
|
|
sys.stdout if is_python2 else sys.stdout.buffer
|
|
|
|
if is_python2 else sys.stdout.buffer)
|
|
|
|
)
|
|
|
|
|
|
|
|
with codecs.open(infile, "r", encoding="utf-8") as textfile:
|
|
|
|
with codecs.open(infile, "r", encoding="utf-8") as textfile:
|
|
|
|
for line in textfile:
|
|
|
|
for line in textfile:
|
|
|
|
if exclude:
|
|
|
|
if exclude:
|
|
|
|
print(
|
|
|
|
print(" ".join(
|
|
|
|
" ".join(
|
|
|
|
map(
|
|
|
|
map(
|
|
|
|
lambda word: word if word not in vocab else "",
|
|
|
|
lambda word: word if word not in vocab else "",
|
|
|
|
line.strip().split(), )))
|
|
|
|
line.strip().split(),
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
print(
|
|
|
|
print(" ".join(
|
|
|
|
" ".join(
|
|
|
|
map(
|
|
|
|
map(
|
|
|
|
lambda word: word if word in vocab else "<UNK>",
|
|
|
|
lambda word: word if word in vocab else "<UNK>",
|
|
|
|
line.strip().split(), )))
|
|
|
|
line.strip().split(),
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
if __name__ == "__main__":
|
|
|
|