Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
clowdflows
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Alain Shakour
clowdflows
Commits
3227f614
Commit
3227f614
authored
Apr 23, 2015
by
hiphop
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
single files splitting
parent
9152c5f7
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
74 additions
and
16 deletions
+74
-16
workflows/nlp/library.py
workflows/nlp/library.py
+74
-16
No files found.
workflows/nlp/library.py
View file @
3227f614
...
...
@@ -101,6 +101,7 @@ def nlp_totrtale2(input_dict, widget):
from
xml.dom.minidom
import
parseString
import
time
import
math
import
copy
progress_accumulator
=
0
widget
.
progress
=
progress_accumulator
...
...
@@ -108,19 +109,20 @@ def nlp_totrtale2(input_dict, widget):
processes
=
4
DOCUMENTS_SIZE
=
3
*
int
(
1e6
)
#Document size (MB) per process
SINGLE_DOC_SIZE
=
1
*
int
(
1e6
)
corpus
=
parseString
(
input_dict
[
'corpus'
])
language
=
input_dict
[
'lang'
],
postprocess
=
input_dict
[
'postprocess'
]
==
"true"
bohoricica
=
input_dict
[
'bohoricica'
]
==
"true"
antique
=
input_dict
[
'antique'
]
==
"true"
xml
=
input_dict
[
'xml'
]
==
"true"
params
=
{
"language"
:
language
,
"postprocess"
:
postprocess
,
"bohoricica"
:
bohoricica
,
"antique"
:
antique
,
"xml"
:
xml
}
tei_corpus
=
corpus
.
getElementsByTagName
(
'teiCorpus'
)
if
tei_corpus
:
tei_head
=
'<?xml version="1.0" encoding="utf-8"?>
\n
'
+
\
...
...
@@ -132,19 +134,57 @@ def nlp_totrtale2(input_dict, widget):
documents
=
corpus
.
getElementsByTagName
(
'TEI'
)
documents_size
,
document_num
,
process_num
=
0
,
0
,
1
results
,
docs
=
[],
[]
results
,
docs
,
single_docs
=
[],
[],
[]
for
i
,
document
in
enumerate
(
documents
):
docs
.
append
(
document
.
toxml
())
documents_size
+=
len
(
document
.
getElementsByTagName
(
'body'
)[
0
].
getElementsByTagName
(
'p'
)[
0
].
childNodes
[
0
].
nodeValue
)
document_num
+=
1
if
documents_size
>
DOCUMENTS_SIZE
or
(
document_num
)
%
10
==
0
or
i
==
len
(
documents
)
-
1
:
#print "Log:",process_num, "process added to queue with", document_num, "documents"
documents_size
=
0
document_num
=
0
params
[
"text"
]
=
"
\n
"
.
join
(
docs
)
results
.
append
(
pool
.
apply_async
(
totrtale_request
,
args
=
[
params
]))
process_num
+=
1
docs
=
[]
doc_len
=
len
(
document
.
getElementsByTagName
(
'body'
)[
0
].
getElementsByTagName
(
'p'
)[
0
].
childNodes
[
0
].
nodeValue
)
if
doc_len
>
SINGLE_DOC_SIZE
:
print
"document was split"
predhead
=
'<TEI xmlns="http://www.tei-c.org/ns/1.0">
\n
'
head
=
'<text>
\n
<body>
\n
<p>
\n
'
header
=
document
.
getElementsByTagName
(
'teiHeader'
)[
0
].
toxml
()
+
"
\n
"
tail
=
'
\n
</p>
\n
</body>
\n
</text>
\n
</TEI>'
document_text
=
document
.
getElementsByTagName
(
'body'
)[
0
].
getElementsByTagName
(
'p'
)[
0
].
childNodes
[
0
].
nodeValue
.
strip
().
replace
(
"&"
,
"&"
).
replace
(
"<"
,
"<"
).
replace
(
">"
,
">"
).
replace
(
"
\"
"
,
"""
)
prev_j
,
curr_j
=
0
,
SINGLE_DOC_SIZE
while
(
curr_j
+
2
)
<
len
(
document_text
):
while
(
curr_j
+
2
)
<
len
(
document_text
)
and
document_text
[
curr_j
:
curr_j
+
2
]
!=
". "
:
curr_j
+=
1
sub_params
=
copy
.
deepcopy
(
params
)
sub_params
[
"text"
]
=
predhead
+
head
+
document_text
[
prev_j
:
curr_j
+
2
]
+
tail
results
.
append
(
pool
.
apply_async
(
totrtale_request
,
args
=
[
sub_params
]))
if
prev_j
==
0
:
single_docs
.
append
(
0
)
else
:
single_docs
.
append
(
1
)
prev_j
=
curr_j
+
2
curr_j
+=
SINGLE_DOC_SIZE
document_num
+=
1
process_num
+=
1
if
curr_j
>
doc_len
:
sub_params
=
copy
.
deepcopy
(
params
)
sub_params
[
"text"
]
=
predhead
+
head
+
document_text
[
prev_j
:]
+
tail
results
.
append
(
pool
.
apply_async
(
totrtale_request
,
args
=
[
sub_params
]))
document_num
+=
1
process_num
+=
1
single_docs
.
append
(
2
)
else
:
print
"whole document was added"
docs
.
append
(
document
.
toxml
())
document_num
+=
1
documents_size
+=
doc_len
if
documents_size
>
DOCUMENTS_SIZE
or
(
document_num
)
%
10
==
0
or
i
==
len
(
documents
)
-
1
:
#print "Log:",process_num, "process added to queue with", document_num, "documents"
documents_size
=
0
document_num
=
0
sub_params
=
copy
.
deepcopy
(
params
)
sub_params
[
"text"
]
=
"
\n
"
.
join
(
docs
)
results
.
append
(
pool
.
apply_async
(
totrtale_request
,
args
=
[
sub_params
]))
process_num
+=
1
docs
=
[]
single_docs
.
append
(
-
1
)
pool
.
close
()
response
=
[
""
for
i
in
results
]
...
...
@@ -153,15 +193,33 @@ def nlp_totrtale2(input_dict, widget):
while
any
(
progress
):
time
.
sleep
(
1
)
progress
=
[
not
result
.
ready
()
for
result
in
results
]
print
progress
for
i
,
prog
in
enumerate
(
progress
):
if
not
prog
and
response
[
i
]
==
""
:
resp
=
json
.
loads
(
results
[
i
].
get
().
content
)[
u
'runToTrTaLeResponse'
][
u
'runToTrTaLeResult'
]
if
resp
[
"error"
]
!=
""
:
progress
=
[
False
]
raise
Exception
(
resp
[
"error"
])
response
[
i
]
=
resp
[
"resp"
]
if
single_docs
[
i
]
==
0
:
print
"remove back"
,
i
pos1
=
resp
[
"resp"
].
find
(
"<s>"
)
pos2
=
resp
[
"resp"
].
find
(
"</p>"
)
response
[
i
]
=
predhead
+
header
+
head
+
resp
[
"resp"
][
pos1
:
pos2
]
elif
single_docs
[
i
]
==
2
:
print
"remove front"
,
i
pos1
=
resp
[
"resp"
].
find
(
"<s>"
)
response
[
i
]
=
resp
[
"resp"
][
pos1
:]
elif
single_docs
[
i
]
==
1
:
print
"remove both"
,
i
pos1
=
resp
[
"resp"
].
find
(
"<s>"
)
pos2
=
resp
[
"resp"
].
find
(
"</p>"
)
response
[
i
]
=
resp
[
"resp"
][
pos1
:
pos2
]
else
:
print
"nothing to remove"
response
[
i
]
=
resp
[
"resp"
]
progress_accumulator
+=
1
/
float
(
len
(
results
))
*
100
#print "progress", progress_accumulator, math.floor(progress_accumulator)
widget
.
progress
=
math
.
floor
(
progress_accumulator
)
widget
.
save
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment