
2018-02-16: bachelor's thesis

Simon Lackerbauer 3 months ago
parent
commit
d888ff24cb
Signed by: Simon Lackerbauer <simon@lackerbauer.com> GPG Key ID: 2B27C889039C0125
15 changed files with 3319 additions and 0 deletions
  1. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/bibliography.bib (+941, -0)
  2. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/dbstmpl.sty (+145, -0)
  3. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/dummy_pattern.png (BIN)
  4. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/dummy_random_walk.png (BIN)
  5. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/example_pattern.png (BIN)
  6. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/fullgraph.png (BIN)
  7. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/partial_graph.png (BIN)
  8. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/small_pattern.png (BIN)
  9. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/sphx_glr_plot_separating_hyperplane_0011.png (BIN)
  10. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/kopf.pdf (BIN)
  11. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/presentation.pdf (BIN)
  12. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/presentation.tex (+285, -0)
  13. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/thesis.pdf (BIN)
  14. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/thesis.tex (+657, -0)
  15. 2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/utcaps.bst (+1291, -0)

+ 941
- 0
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/bibliography.bib View File

@@ -0,0 +1,941 @@
1
+% Generated by Paperpile. Check out http://paperpile.com for more information.
2
+% BibTeX export options can be customized via Settings -> BibTeX.
3
+
4
+@TECHREPORT{Saarinen2015-vh,
5
+  title    = "The {BLAKE2} cryptographic hash and message authentication code
6
+              ({MAC})",
7
+  author   = "Saarinen, Markku Juhani and Aumasson, Jean-Philippe",
8
+  number   = "RFC 7693",
9
+  year     =  2015,
10
+  keywords = "compsci;Efficient Event Classification through Constrained
11
+              Subgraph Mining"
12
+}
13
+
14
+@TECHREPORT{Yan2002-hg,
15
+  title       = "gSpan: {Graph-Based} Substructure Pattern Mining, Expanded
16
+                 Version",
17
+  author      = "Yan, Xifeng and Han, Jiawei",
18
+  number      = "UIUCDCS-R-2002-2296",
19
+  institution = "UIUC Technical Report",
20
+  year        =  2002,
21
+  keywords    = "compsci;Efficient Event Classification through Constrained
22
+                 Subgraph Mining"
23
+}
24
+
25
+@ARTICLE{Chang2011-wa,
26
+  title     = "{LIBSVM}: A Library for Support Vector Machines",
27
+  author    = "Chang, Chih-Chung and Lin, Chih-Jen",
28
+  journal   = "ACM Trans. Intell. Syst. Technol.",
29
+  publisher = "ACM",
30
+  volume    =  2,
31
+  number    =  3,
32
+  pages     = "27:1--27:27",
33
+  month     =  may,
34
+  year      =  2011,
35
+  address   = "New York, NY, USA",
36
+  keywords  = "Classification LIBSVM optimization regression support vector
37
+               machines SVM;compsci;Efficient Event Classification through
38
+               Constrained Subgraph Mining"
39
+}
40
+
41
+@MISC{noauthor_undated-io,
42
+  title        = "1.4. Support Vector Machines --- scikit-learn 0.19.1
43
+                  documentation",
44
+  howpublished = "\url{http://scikit-learn.org/stable/modules/svm.html}",
45
+  note         = "Accessed: 2018-2-4",
46
+  keywords     = "compsci;Efficient Event Classification through Constrained
47
+                  Subgraph Mining"
48
+}
49
+
50
+@ARTICLE{Pedregosa2011-ld,
51
+  title    = "Scikit-learn: Machine Learning in Python",
52
+  author   = "Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre
53
+              and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and
54
+              Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and
55
+              Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and
56
+              Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and
57
+              Duchesnay, {\'E}douard",
58
+  journal  = "J. Mach. Learn. Res.",
59
+  volume   =  12,
60
+  number   = "Oct",
61
+  pages    = "2825--2830",
62
+  year     =  2011,
63
+  keywords = "compsci;Efficient Event Classification through Constrained
64
+              Subgraph Mining"
65
+}
66
+
67
+@INPROCEEDINGS{Han1999-bj,
68
+  title     = "Efficient mining of partial periodic patterns in time series
69
+               database",
70
+  booktitle = "Proceedings 15th International Conference on Data Engineering
71
+               (Cat. {No.99CB36337})",
72
+  author    = "Han, Jiawei and Dong, Guozhu and Yin, Yiwen",
73
+  pages     = "106--115",
74
+  month     =  mar,
75
+  year      =  1999,
76
+  keywords  = "data mining;statistical databases;time series;data mining;hit
77
+               set property;partial periodicity search;periodicity
78
+               search;time-series databases;Algorithm design and
79
+               analysis;Cities and towns;Computer science;Councils;Data
80
+               analysis;Data mining;Databases;Read only
81
+               memory;Sun;compsci;Efficient Event Classification through
82
+               Constrained Subgraph Mining"
83
+}
84
+
85
+@ARTICLE{Zaki2001-jy,
86
+  title     = "{SPADE}: An Efficient Algorithm for Mining Frequent Sequences",
87
+  author    = "Zaki, Mohammed J",
88
+  journal   = "Mach. Learn.",
89
+  publisher = "Kluwer Academic Publishers",
90
+  volume    =  42,
91
+  number    = "1-2",
92
+  pages     = "31--60",
93
+  month     =  jan,
94
+  year      =  2001,
95
+  keywords  = "compsci;Efficient Event Classification through Constrained
96
+               Subgraph Mining",
97
+  language  = "en"
98
+}
99
+
100
+@ARTICLE{Han2004-qs,
101
+  title     = "From sequential pattern mining to structured pattern mining: A
102
+               pattern-growth approach",
103
+  author    = "Han, Jia-Wei and Pei, Jian and Yan, Xi-Feng",
104
+  journal   = "J. Comput. Sci. \& Technol.",
105
+  publisher = "Science Press",
106
+  volume    =  19,
107
+  number    =  3,
108
+  pages     = "257--279",
109
+  month     =  may,
110
+  year      =  2004,
111
+  keywords  = "compsci;Efficient Event Classification through Constrained
112
+               Subgraph Mining",
113
+  language  = "en"
114
+}
115
+
116
+@ARTICLE{Pei2002-ud,
117
+  title     = "Constrained Frequent Pattern Mining: A Pattern-growth View",
118
+  author    = "Pei, Jian and Han, Jiawei",
119
+  journal   = "SIGKDD Explor. Newsl.",
120
+  publisher = "ACM",
121
+  volume    =  4,
122
+  number    =  1,
123
+  pages     = "31--39",
124
+  month     =  jun,
125
+  year      =  2002,
126
+  address   = "New York, NY, USA",
127
+  keywords  = "compsci;Efficient Event Classification through Constrained
128
+               Subgraph Mining"
129
+}
130
+
131
+@ARTICLE{Han2000-ut,
132
+  title     = "Mining frequent patterns by pattern-growth: methodology and
133
+               implications",
134
+  author    = "Han, Jiawei and Pei, Jian",
135
+  journal   = "ACM SIGKDD Explorations Newsletter",
136
+  publisher = "ACM",
137
+  volume    =  2,
138
+  number    =  2,
139
+  pages     = "14--20",
140
+  month     =  dec,
141
+  year      =  2000,
142
+  keywords  = "associations; constraint-based mining; frequent patterns;
143
+               scalable data mining methods and algorithms; sequential
144
+               patterns;compsci;Efficient Event Classification through
145
+               Constrained Subgraph Mining"
146
+}
147
+
148
+@INPROCEEDINGS{Srikant1996-dy,
149
+  title     = "Mining Quantitative Association Rules in Large Relational Tables",
150
+  booktitle = "Proceedings of the 1996 {ACM} {SIGMOD} International Conference
151
+               on Management of Data",
152
+  author    = "Srikant, Ramakrishnan and Agrawal, Rakesh",
153
+  publisher = "ACM",
154
+  pages     = "1--12",
155
+  series    = "SIGMOD '96",
156
+  year      =  1996,
157
+  address   = "New York, NY, USA",
158
+  keywords  = "compsci;Efficient Event Classification through Constrained
159
+               Subgraph Mining"
160
+}
161
+
162
+@ARTICLE{Kossinets2006-rw,
163
+  title       = "Empirical analysis of an evolving social network",
164
+  author      = "Kossinets, Gueorgi and Watts, Duncan J",
165
+  affiliation = "Department of Sociology and Institute for Social and Economic
166
+                 Research and Policy, Columbia University, 420 West 118th
167
+                 Street, MC 3355, New York, NY 10027, USA. gk297@columbia.edu",
168
+  journal     = "Science",
169
+  publisher   = "science.sciencemag.org",
170
+  volume      =  311,
171
+  number      =  5757,
172
+  pages       = "88--90",
173
+  month       =  jan,
174
+  year        =  2006,
175
+  keywords    = "socsci;Efficient Event Classification through Constrained
176
+                 Subgraph Mining",
177
+  language    = "en"
178
+}
179
+
180
+@ARTICLE{Cortes1995-ix,
181
+  title     = "Support-vector networks",
182
+  author    = "Cortes, Corinna and Vapnik, Vladimir",
183
+  journal   = "Mach. Learn.",
184
+  publisher = "Kluwer Academic Publishers",
185
+  volume    =  20,
186
+  number    =  3,
187
+  pages     = "273--297",
188
+  month     =  sep,
189
+  year      =  1995,
190
+  keywords  = "compsci;Efficient Event Classification through Constrained
191
+               Subgraph Mining",
192
+  language  = "en"
193
+}
194
+
195
+@MISC{noauthor_undated-bv,
196
+  title        = "Apache Hadoop - Open source software for reliable, scalable,
197
+                  distributed computing",
198
+  howpublished = "\url{http://hadoop.apache.org/}",
199
+  note         = "Accessed: 2018-2-2",
200
+  keywords     = "misc;Efficient Event Classification through Constrained
201
+                  Subgraph Mining"
202
+}
203
+
204
+@MISC{noauthor_undated-zu,
205
+  title        = "Elasticsearch - Open Source Search \& Analytics",
206
+  howpublished = "\url{https://www.elastic.co/}",
207
+  note         = "Accessed: 2018-2-2",
208
+  keywords     = "misc;Efficient Event Classification through Constrained
209
+                  Subgraph Mining"
210
+}
211
+
212
+@MISC{noauthor_undated-vl,
213
+  title        = "Grafana - The open platform for analytics and monitoring",
214
+  booktitle    = "Grafana Labs",
215
+  howpublished = "\url{https://grafana.com/}",
216
+  note         = "Accessed: 2018-2-2",
217
+  keywords     = "misc;Efficient Event Classification through Constrained
218
+                  Subgraph Mining"
219
+}
220
+
221
+@MISC{noauthor_undated-xi,
222
+  title        = "Graylog - Open Source Log Management",
223
+  howpublished = "\url{https://www.graylog.org/}",
224
+  note         = "Accessed: 2018-2-2",
225
+  keywords     = "misc;Efficient Event Classification through Constrained
226
+                  Subgraph Mining"
227
+}
228
+
229
+@ARTICLE{Travers1967-cn,
230
+  title     = "The small world problem",
231
+  author    = "Travers, Jeffrey and Milgram, Stanley",
232
+  journal   = "Phychology Today",
233
+  publisher = "JSTOR",
234
+  volume    =  1,
235
+  number    =  1,
236
+  pages     = "61--67",
237
+  year      =  1967,
238
+  keywords  = "socsci;Efficient Event Classification through Constrained
239
+               Subgraph Mining"
240
+}
241
+
242
+@BOOK{Newman2010-ac,
243
+  title     = "Networks: An Introduction",
244
+  author    = "Newman, Mark",
245
+  publisher = "Oxford University Press",
246
+  month     =  mar,
247
+  year      =  2010,
248
+  keywords  = "compsci;Efficient Event Classification through Constrained
249
+               Subgraph Mining",
250
+  language  = "en"
251
+}
252
+
253
+@INPROCEEDINGS{Cook1971-fj,
254
+  title     = "The complexity of theorem-proving procedures",
255
+  booktitle = "Proceedings of the third annual {ACM} symposium on Theory of
256
+               computing",
257
+  author    = "Cook, Stephen A",
258
+  publisher = "ACM",
259
+  pages     = "151--158",
260
+  month     =  may,
261
+  year      =  1971,
262
+  keywords  = "compsci;Efficient Event Classification through Constrained
263
+               Subgraph Mining"
264
+}
265
+
266
+@MISC{Rahtz_undated-bv,
267
+  title        = "{TeX} Live - {TeX} Users Group",
268
+  author       = "Rahtz, Sebastian and Kakuto, Akira and Berry, Karl and
269
+                  Scarso, Luigi and Miklavec, Mojka and Preining, Norbert and
270
+                  Scarso, Luigi and Miklavec, Mojca and Preining, Norbert and
271
+                  Staszek",
272
+  howpublished = "\url{https://www.tug.org/texlive/}",
273
+  note         = "Accessed: 2018-2-2",
274
+  keywords     = "misc;Efficient Event Classification through Constrained
275
+                  Subgraph Mining"
276
+}
277
+
278
+@MISC{Van_der_Zander_undated-kf,
279
+  title        = "{TeXstudio}",
280
+  author       = "van der Zander, Benito",
281
+  howpublished = "\url{http://www.texstudio.org/}",
282
+  note         = "Accessed: 2018-2-2",
283
+  keywords     = "misc;Efficient Event Classification through Constrained
284
+                  Subgraph Mining"
285
+}
286
+
287
+@ARTICLE{Shannon2003-gg,
288
+  title       = "Cytoscape: a software environment for integrated models of
289
+                 biomolecular interaction networks",
290
+  author      = "Shannon, Paul and Markiel, Andrew and Ozier, Owen and Baliga,
291
+                 Nitin S and Wang, Jonathan T and Ramage, Daniel and Amin, Nada
292
+                 and Schwikowski, Benno and Ideker, Trey",
293
+  affiliation = "Institute for Systems Biology, Seattle, Washington 98103, USA.",
294
+  journal     = "Genome Res.",
295
+  volume      =  13,
296
+  number      =  11,
297
+  pages       = "2498--2504",
298
+  month       =  nov,
299
+  year        =  2003,
300
+  keywords    = "misc;Efficient Event Classification through Constrained
301
+                 Subgraph Mining",
302
+  language    = "en"
303
+}
304
+
305
+@ARTICLE{Garey1976-po,
306
+  title    = "Some simplified {NP-complete} graph problems",
307
+  author   = "Garey, M R and Johnson, D S and Stockmeyer, L",
308
+  journal  = "Theor. Comput. Sci.",
309
+  volume   =  1,
310
+  number   =  3,
311
+  pages    = "237--267",
312
+  month    =  feb,
313
+  year     =  1976,
314
+  keywords = "maths;Efficient Event Classification through Constrained Subgraph
315
+              Mining"
316
+}
317
+
318
+@TECHREPORT{Fortin1996-la,
319
+  title       = "The graph isomorphism problem",
320
+  author      = "Fortin, Scott",
321
+  number      = "96-20",
322
+  institution = "University of Alberta",
323
+  year        =  1996,
324
+  keywords    = "maths;Efficient Event Classification through Constrained
325
+                 Subgraph Mining"
326
+}
327
+
328
+@ARTICLE{Dragoni2016-fh,
329
+  title         = "Microservices: yesterday, today, and tomorrow",
330
+  author        = "Dragoni, Nicola and Giallorenzo, Saverio and Lafuente,
331
+                   Alberto Lluch and Mazzara, Manuel and Montesi, Fabrizio and
332
+                   Mustafin, Ruslan and Safina, Larisa",
333
+  month         =  jun,
334
+  year          =  2016,
335
+  keywords      = "compsci;Efficient Event Classification through Constrained
336
+                   Subgraph Mining",
337
+  archivePrefix = "arXiv",
338
+  primaryClass  = "cs.SE",
339
+  eprint        = "1606.04036"
340
+}
341
+
342
+@ARTICLE{Peters1993-fw,
343
+  title    = "The history and development of transaction log analysis",
344
+  author   = "Peters, Thomas A",
345
+  journal  = "Library Hi Tech",
346
+  volume   =  11,
347
+  number   =  2,
348
+  pages    = "41--66",
349
+  year     =  1993,
350
+  keywords = "compsci;Efficient Event Classification through Constrained
351
+              Subgraph Mining"
352
+}
353
+
354
+@ARTICLE{Agrawal1993-nc,
355
+  title     = "Mining Association Rules Between Sets of Items in Large
356
+               Databases",
357
+  author    = "Agrawal, Rakesh and Imieli{\'n}ski, Tomasz and Swami, Arun",
358
+  journal   = "SIGMOD Rec.",
359
+  publisher = "ACM",
360
+  volume    =  22,
361
+  number    =  2,
362
+  pages     = "207--216",
363
+  month     =  jun,
364
+  year      =  1993,
365
+  address   = "New York, NY, USA",
366
+  keywords  = "compsci;Efficient Event Classification through Constrained
367
+               Subgraph Mining"
368
+}
369
+
370
+@INPROCEEDINGS{Agrawal1994-ca,
371
+  title     = "Fast algorithms for mining association rules",
372
+  booktitle = "Proc. 20th int. conf. very large data bases, {VLDB}",
373
+  author    = "Agrawal, Rakesh and Srikant, Ramakrishnan and {Others}",
374
+  volume    =  1215,
375
+  pages     = "487--499",
376
+  year      =  1994,
377
+  keywords  = "compsci;Efficient Event Classification through Constrained
378
+               Subgraph Mining"
379
+}
380
+
381
+@INPROCEEDINGS{Pei2000-rz,
382
+  title      = "Mining Access Patterns Efficiently from Web Logs",
383
+  booktitle  = "Knowledge Discovery and Data Mining. Current Issues and New
384
+                Applications",
385
+  author     = "Pei, Jian and Han, Jiawei and Mortazavi-asl, Behzad and Zhu,
386
+                Hua",
387
+  publisher  = "Springer, Berlin, Heidelberg",
388
+  pages      = "396--407",
389
+  month      =  apr,
390
+  year       =  2000,
391
+  keywords   = "compsci;Efficient Event Classification through Constrained
392
+                Subgraph Mining",
393
+  language   = "en",
394
+  conference = "Pacific-Asia Conference on Knowledge Discovery and Data Mining"
395
+}
396
+
397
+@ARTICLE{Jung2017-zs,
398
+  title         = "When is Network Lasso Accurate?",
399
+  author        = "Jung, Alexander",
400
+  month         =  apr,
401
+  year          =  2017,
402
+  keywords      = "stats;Efficient Event Classification through Constrained
403
+                   Subgraph Mining",
404
+  archivePrefix = "arXiv",
405
+  primaryClass  = "stat.ML",
406
+  eprint        = "1704.02107"
407
+}
408
+
409
+@ARTICLE{Haghiri2017-mk,
410
+  title         = "Comparison Based Nearest Neighbor Search",
411
+  author        = "Haghiri, Siavash and Ghoshdastidar, Debarghya and von
412
+                   Luxburg, Ulrike",
413
+  month         =  apr,
414
+  year          =  2017,
415
+  keywords      = "stats;Efficient Event Classification through Constrained
416
+                   Subgraph Mining",
417
+  archivePrefix = "arXiv",
418
+  primaryClass  = "stat.ML",
419
+  eprint        = "1704.01460"
420
+}
421
+
422
+@ARTICLE{Lei2017-ip,
423
+  title         = "{Cross-Validation} with Confidence",
424
+  author        = "Lei, Jing",
425
+  month         =  mar,
426
+  year          =  2017,
427
+  keywords      = "stats;Efficient Event Classification through Constrained
428
+                   Subgraph Mining",
429
+  archivePrefix = "arXiv",
430
+  primaryClass  = "stat.ME",
431
+  eprint        = "1703.07904"
432
+}
433
+
434
+@INPROCEEDINGS{Ringsquandl2016-en,
435
+  title           = "Knowledge Graph Constraints for Multi-label Graph
436
+                     Classification",
437
+  booktitle       = "2016 {IEEE} 16th International Conference on Data Mining
438
+                     Workshops ({ICDMW})",
439
+  author          = "Ringsquandl, Martin and Lamparter, Steffen and Thon, Ingo
440
+                     and Lepratti, Raffaello and Kroger, Peer",
441
+  publisher       = "IEEE",
442
+  pages           = "121--127",
443
+  year            =  2016,
444
+  keywords        = "compsci;Efficient Event Classification through Constrained
445
+                     Subgraph Mining",
446
+  conference      = "2016 IEEE 16th International Conference on Data Mining
447
+                     Workshops (ICDMW)"
448
+}
449
+
450
+@ARTICLE{De_Graaf2007-hh,
451
+  title         = "Clustering with Lattices in the Analysis of Graph Patterns",
452
+  author        = "de Graaf, Edgar H and Kok, Joost N and Kosters, Walter A",
453
+  month         =  may,
454
+  year          =  2007,
455
+  keywords      = "compsci;Efficient Event Classification through Constrained
456
+                   Subgraph Mining",
457
+  archivePrefix = "arXiv",
458
+  primaryClass  = "cs.AI",
459
+  eprint        = "0705.0593"
460
+}
461
+
462
+@ARTICLE{Hallac2015-uk,
463
+  title       = "Network Lasso: Clustering and Optimization in Large Graphs",
464
+  author      = "Hallac, David and Leskovec, Jure and Boyd, Stephen",
465
+  affiliation = "Stanford University. Stanford University. Stanford University.",
466
+  journal     = "KDD",
467
+  volume      =  2015,
468
+  pages       = "387--396",
469
+  month       =  aug,
470
+  year        =  2015,
471
+  keywords    = "ADMM; Convex Optimization; Network Lasso;compsci;Efficient
472
+                 Event Classification through Constrained Subgraph Mining",
473
+  language    = "en"
474
+}
475
+
476
+@ARTICLE{Wei2017-ak,
477
+  title         = "A Joint Framework for Argumentative Text Analysis
478
+                   Incorporating Domain Knowledge",
479
+  author        = "Wei, Zhongyu and Li, Chen and Liu, Yang",
480
+  month         =  jan,
481
+  year          =  2017,
482
+  keywords      = "compsci;Efficient Event Classification through Constrained
483
+                   Subgraph Mining",
484
+  archivePrefix = "arXiv",
485
+  primaryClass  = "cs.CL",
486
+  eprint        = "1701.05343"
487
+}
488
+
489
+@ARTICLE{Bayer2017-af,
490
+  title         = "Graph Based Relational Features for Collective
491
+                   Classification",
492
+  author        = "Bayer, Immanuel and Nagel, Uwe and Rendle, Steffen",
493
+  month         =  feb,
494
+  year          =  2017,
495
+  keywords      = "compsci;Efficient Event Classification through Constrained
496
+                   Subgraph Mining",
497
+  archivePrefix = "arXiv",
498
+  primaryClass  = "cs.IR",
499
+  eprint        = "1702.02817"
500
+}
501
+
502
+@ARTICLE{Dhiman2016-wo,
503
+  title    = "Optimizing Frequent Subgraph Mining for Single Large Graph",
504
+  author   = "Dhiman, Aarzoo and Jain, S K",
505
+  journal  = "Procedia Comput. Sci.",
506
+  volume   =  89,
507
+  pages    = "378--385",
508
+  year     =  2016,
509
+  keywords = "Frequent Subgraph Mining; Graph; Optimization; Single Graph;
510
+              Subgraph Isomorphism;compsci;Efficient Event Classification
511
+              through Constrained Subgraph Mining"
512
+}
513
+
514
+@INPROCEEDINGS{Dhiman2016-jq,
515
+  title           = "Frequent subgraph mining algorithms for single large
516
+                     graphs --- A brief survey",
517
+  booktitle       = "2016 International Conference on Advances in Computing,
518
+                     Communication, \& Automation ({ICACCA}) (Spring)",
519
+  author          = "Dhiman, Aarzoo and Jain, S K",
520
+  publisher       = "IEEE",
521
+  pages           = "1--6",
522
+  month           =  apr,
523
+  year            =  2016,
524
+  keywords        = "compsci;Efficient Event Classification through Constrained
525
+                     Subgraph Mining",
526
+  conference      = "2016 International Conference on Advances in Computing,
527
+                     Communication, \& Automation (ICACCA) (Spring)"
528
+}
529
+
530
+@INPROCEEDINGS{Zou2010-ze,
531
+  title     = "Frequent subgraph mining on a single large graph using sampling
532
+               techniques",
533
+  booktitle = "Proceedings of the Eighth Workshop on Mining and Learning with
534
+               Graphs",
535
+  author    = "Zou, Ruoyu and Holder, Lawrence B",
536
+  publisher = "ACM",
537
+  pages     = "171--178",
538
+  month     =  jul,
539
+  year      =  2010,
540
+  keywords  = "graph mining; large graph; sampling;compsci;Efficient Event
541
+               Classification through Constrained Subgraph Mining"
542
+}
543
+
544
+@INCOLLECTION{Moussaoui2016-ng,
545
+  title     = "{POSGRAMI}: Possibilistic Frequent Subgraph Mining in a Single
546
+               Large Graph",
547
+  booktitle = "Information Processing and Management of Uncertainty in
548
+               {Knowledge-Based} Systems",
549
+  author    = "Moussaoui, Mohamed and Zaghdoud, Montaceur and Akaichi, Jalel",
550
+  editor    = "Carvalho, Joao Paulo and Lesot, Marie-Jeanne and Kaymak, Uzay
551
+               and Vieira, Susana and Bouchon-Meunier, Bernadette and Yager,
552
+               Ronald R",
553
+  publisher = "Springer International Publishing",
554
+  volume    =  610,
555
+  pages     = "549--561",
556
+  series    = "Communications in Computer and Information Science",
557
+  year      =  2016,
558
+  address   = "Cham",
559
+  keywords  = "compsci;Efficient Event Classification through Constrained
560
+               Subgraph Mining"
561
+}
562
+
563
+@INPROCEEDINGS{Elseidy2014-fz,
564
+  title     = "{GRAMI}: Frequent Subgraph and Pattern Mining in a Single Large
565
+               Graph",
566
+  booktitle = "Proceedings of the {VLDB} Endowment",
567
+  author    = "Elseidy, Mohammed and Abdelhamid, Ehab and Skiadopoulos, Spiros
568
+               and Kalnis, Panos",
569
+  month     =  mar,
570
+  year      =  2014,
571
+  keywords  = "compsci;Efficient Event Classification through Constrained
572
+               Subgraph Mining"
573
+}
574
+
575
+@INPROCEEDINGS{Yan2003-dl,
576
+  title      = "{CloseGraph}: Mining Closed Frequent Graph Patterns",
577
+  booktitle  = "Proceedings of the ninth {ACM} {SIGKDD} international
578
+                conference on Knowledge discovery and data mining",
579
+  author     = "Yan, Xifeng and Han, Jiawei",
580
+  year       =  2003,
581
+  keywords   = "compsci;Efficient Event Classification through Constrained
582
+                Subgraph Mining",
583
+  conference = "KDD '03"
584
+}
585
+
586
+@ARTICLE{Han2007-qx,
587
+  title     = "Frequent pattern mining: current status and future directions",
588
+  journal   = "Data Min. Knowl. Discov.",
590
+  author    = "Han, Jiawei and Cheng, Hong and Xin, Dong and Yan, Xifeng",
591
+  publisher = "Kluwer Academic Publishers-Plenum Publishers",
592
+  volume    =  15,
593
+  pages     = "55--86",
594
+  month     =  aug,
595
+  year      =  2007,
596
+  keywords  = "compsci;Efficient Event Classification through Constrained
597
+               Subgraph Mining",
598
+  language  = "en"
599
+}
600
+
601
+@INPROCEEDINGS{Yan2002-sj,
602
+  title      = "gSpan: graph-based substructure pattern mining",
603
+  booktitle  = "2002 {IEEE} International Conference on Data Mining, 2002.
604
+                Proceedings.",
605
+  author     = "Yan, Xifeng and Han, Jiawei",
606
+  publisher  = "IEEE Comput. Soc",
607
+  pages      = "721--724",
608
+  year       =  2002,
609
+  keywords   = "data mining;tree searching;algorithm;canonical
610
+                label;depth-first search strategy;frequent connected subgraph
611
+                mining;frequent graph-based pattern mining;frequent
612
+                substructure discovery;gSpan;graph datasets;graph-based
613
+                substructure pattern mining;lexicographic order;performance
614
+                study;unique minimum DFS code;Chemical compounds;Computer
615
+                science;Costs;Data mining;Data
616
+                structures;Graphics;Itemsets;Kernel;Testing;Tree
617
+                graphs;compsci;Efficient Event Classification through
618
+                Constrained Subgraph Mining",
619
+  conference = "2002 IEEE International Conference on Data Mining. ICDM 2002"
620
+}
621
+
622
+@BOOK{Ojeda2014-lq,
623
+  title     = "Practical Data Science Cookbook",
624
+  author    = "Ojeda, Tony and Murphy, Sean Patrick and Bengfort, Benjamin and
625
+               Dasgupta, Abhijit",
626
+  publisher = "Packt Publishing Ltd",
627
+  month     =  sep,
628
+  year      =  2014,
629
+  keywords  = "stats;Efficient Event Classification through Constrained
630
+               Subgraph Mining",
631
+  language  = "en"
632
+}
633
+
634
+@BOOK{Cuesta2013-ha,
635
+  title     = "Practical Data Analysis",
636
+  author    = "Cuesta, Hector",
637
+  publisher = "Packt Publishing",
638
+  year      =  2013,
639
+  keywords  = "stats;Efficient Event Classification through Constrained
640
+               Subgraph Mining"
641
+}
642
+
643
+@BOOK{Coelho2015-wo,
644
+  title     = "Building Machine Learning Systems with Python - Second Edition",
645
+  author    = "Coelho, Luis Pedro and Richert, Willi",
646
+  publisher = "Packt Publishing Ltd",
647
+  month     =  mar,
648
+  year      =  2015,
649
+  keywords  = "compsci;Efficient Event Classification through Constrained
650
+               Subgraph Mining",
651
+  language  = "en"
652
+}
653
+
654
+@INPROCEEDINGS{Crouch2013-qe,
655
+  title           = "Dynamic Graphs in the {Sliding-Window} Model?",
656
+  booktitle       = "Algorithms -- {ESA} 2013 Proceedings",
657
+  author          = "Crouch, Michael S and McGregor, Andrew and Stubbs, Daniel",
658
+  year            =  2013,
659
+  keywords        = "compsci;Efficient Event Classification through Constrained
660
+                     Subgraph Mining",
661
+  conference      = "Algorithms -- ESA 2013"
662
+}
663
+
664
+@INPROCEEDINGS{Kyrola2012-wx,
665
+  title     = "{GraphChi}: {Large-Scale} Graph Computation on Just a {PC}",
666
+  booktitle = "{OSDI}",
667
+  author    = "Kyrola, Aapo and Blelloch, Guy E and Guestrin, Carlos and
668
+               {Others}",
669
+  volume    =  12,
670
+  pages     = "31--46",
671
+  year      =  2012,
672
+  keywords  = "compsci;Efficient Event Classification through Constrained
673
+               Subgraph Mining"
674
+}
675
+
676
+@ARTICLE{Grone1994-qc,
677
+  title     = "The Laplacian spectrum of a graph {II}",
678
+  author    = "Grone, Robert and Merris, Russell",
679
+  journal   = "SIAM J. Discrete Math.",
680
+  publisher = "SIAM",
681
+  volume    =  7,
682
+  number    =  2,
683
+  pages     = "221--229",
684
+  year      =  1994,
685
+  keywords  = "maths;Efficient Event Classification through Constrained
686
+               Subgraph Mining"
687
+}
688
+
689
+@ARTICLE{Grone1990-fa,
690
+  title     = "The Laplacian Spectrum of a Graph",
691
+  author    = "Grone, Robert and Merris, Russell and Sunder, V S",
692
+  journal   = "SIAM J. Matrix Anal. Appl.",
693
+  publisher = "SIAM",
694
+  volume    =  11,
695
+  number    =  2,
696
+  pages     = "218--238",
697
+  year      =  1990,
698
+  keywords  = "maths;Efficient Event Classification through Constrained
699
+               Subgraph Mining"
700
+}
701
+
702
+@ARTICLE{Mohar1991-lx,
703
+  title     = "The Laplacian spectrum of graphs",
704
+  author    = "Mohar, Bojan and Alavi, Y and Chartrand, G and Oellermann, O R",
705
+  journal   = "Graph theory, combinatorics, and applications",
706
+  publisher = "academia.edu",
707
+  volume    =  2,
708
+  number    = "871-898",
709
+  pages     = "12",
710
+  year      =  1991,
711
+  keywords  = "maths;Efficient Event Classification through Constrained
712
+               Subgraph Mining"
713
+}
714
+
715
+@BOOK{Han2006-on,
716
+  title     = "Data Mining: Concepts and Techniques",
717
+  author    = "Han, Jiawei and Kamber, Micheline",
718
+  publisher = "Morgan Kaufmann",
719
+  edition   =  2,
720
+  year      =  2006,
721
+  keywords  = "compsci;Efficient Event Classification through Constrained
722
+               Subgraph Mining"
723
+}
724
+
725
+@ARTICLE{Yan2005-nz,
726
+  title     = "Mining closed relational graphs with connectivity constraints",
727
+  author    = "Yan, X and Zhou, X and Han, J",
728
+  journal   = "Proceedings of the eleventh ACM SIGKDD",
729
+  publisher = "dl.acm.org",
730
+  year      =  2005,
731
+  keywords  = "compsci;Efficient Event Classification through Constrained
732
+               Subgraph Mining"
733
+}
734
+
735
+@INPROCEEDINGS{Kong2013-ug,
736
+  title     = "Multi-label Classification by Mining Label and Instance
737
+               Correlations from Heterogeneous Information Networks",
738
+  booktitle = "Proceedings of the 19th {ACM} {SIGKDD} International Conference
739
+               on Knowledge Discovery and Data Mining",
740
+  author    = "Kong, Xiangnan and Cao, Bokai and Yu, Philip S",
741
+  publisher = "ACM",
742
+  pages     = "614--622",
743
+  series    = "KDD '13",
744
+  year      =  2013,
745
+  address   = "New York, NY, USA",
746
+  keywords  = "data mining, heterogeneous information network, label
747
+               correlation, multi-label classification;compsci;Efficient Event
748
+               Classification through Constrained Subgraph Mining"
749
+}
750
+
751
+@INPROCEEDINGS{Kong2012-fj,
752
+  title     = "Meta Path-based Collective Classification in Heterogeneous
753
+               Information Networks",
754
+  booktitle = "Proceedings of the 21st {ACM} International Conference on
755
+               Information and Knowledge Management",
756
+  author    = "Kong, Xiangnan and Yu, Philip S and Ding, Ying and Wild, David J",
757
+  publisher = "ACM",
758
+  pages     = "1567--1571",
759
+  series    = "CIKM '12",
760
+  year      =  2012,
761
+  address   = "New York, NY, USA",
762
+  keywords  = "heterogeneous information networks, meta path;compsci;Efficient
763
+               Event Classification through Constrained Subgraph Mining"
764
+}
765
+
766
+@BOOK{Han2011-ms,
767
+  title     = "Data Mining: Concepts and Techniques",
768
+  author    = "Han, Jiawei and Pei, Jian and Kamber, Micheline",
769
+  publisher = "Elsevier",
770
+  edition   =  3,
771
+  month     =  jun,
772
+  year      =  2011,
773
+  keywords  = "compsci;Efficient Event Classification through Constrained
774
+               Subgraph Mining",
775
+  language  = "en"
776
+}
777
+
778
+@INPROCEEDINGS{Deville2016-pp,
779
+  title      = "{GriMa}: A Grid Mining Algorithm for {Bag-of-Grid-Based}
780
+                Classification",
781
+  booktitle  = "Structural, Syntactic, and Statistical Pattern Recognition",
782
+  author     = "Deville, Romain and Fromont, Elisa and Jeudy, Baptiste and
783
+                Solnon, Christine",
784
+  editor     = "Robles-Kelly, Antonio and Loog, Marco and Biggio, Battista and
785
+                Escolano, Francisco and Wilson, Richard",
786
+  publisher  = "Springer International Publishing",
787
+  pages      = "132--142",
788
+  series     = "Lecture Notes in Computer Science",
789
+  month      =  nov,
790
+  year       =  2016,
791
+  keywords   = "compsci;Efficient Event Classification through Constrained
792
+                Subgraph Mining",
793
+  language   = "en",
794
+  conference = "Joint IAPR International Workshops on Statistical Techniques in
795
+                Pattern Recognition (SPR) and Structural and Syntactic Pattern
796
+                Recognition (SSPR)"
797
+}
798
+
799
+@ARTICLE{Bianchi2015-wu,
800
+  title     = "Granular Computing Techniques for Classification and Semantic
801
+               Characterization of Structured Data",
802
+  author    = "Bianchi, Filippo Maria and Scardapane, Simone and Rizzi,
803
+               Antonello and Uncini, Aurelio and Sadeghian, Alireza",
804
+  journal   = "Cognit. Comput.",
805
+  publisher = "Springer US",
806
+  volume    =  8,
807
+  number    =  3,
808
+  pages     = "442--461",
809
+  month     =  dec,
810
+  year      =  2015,
811
+  keywords  = "compsci;Efficient Event Classification through Constrained
812
+               Subgraph Mining",
813
+  language  = "en"
814
+}
815
+
816
+@ARTICLE{Hu2016-uv,
817
+  title     = "An aerial image recognition framework using discrimination and
818
+               redundancy quality measure",
819
+  author    = "Hu, Yuxing and Nie, Liqiang",
820
+  journal   = "J. Vis. Commun. Image Represent.",
821
+  publisher = "Elsevier",
822
+  volume    =  37,
823
+  pages     = "53--62",
824
+  year      =  2016,
825
+  keywords  = "Aerial image; Categorization; Discriminative; Subgraph; Data
826
+               mining; Image recognition; Framework; Quality
827
+               measure;compsci;Efficient Event Classification through
828
+               Constrained Subgraph Mining"
829
+}
830
+
831
+@INPROCEEDINGS{Arora2010-bc,
832
+  title     = "Sentiment Classification Using Automatically Extracted Subgraph
833
+               Features",
834
+  booktitle = "Proceedings of the {NAACL} {HLT} 2010 Workshop on Computational
835
+               Approaches to Analysis and Generation of Emotion in Text",
836
+  author    = "Arora, Shilpa and Mayfield, Elijah and Penstein-Ros{\'e},
837
+               Carolyn and Nyberg, Eric",
838
+  publisher = "Association for Computational Linguistics",
839
+  pages     = "131--139",
840
+  series    = "CAAGET '10",
841
+  year      =  2010,
842
+  address   = "Stroudsburg, PA, USA",
843
+  keywords  = "compsci;Efficient Event Classification through Constrained
844
+               Subgraph Mining"
845
+}
846
+
847
+@ARTICLE{Conte2004-ki,
848
+  title     = "Thirty years of graph matching in pattern recognition",
850
+  author    = "Conte, D and Foggia, P and Sansone, C and Vento, M",
851
+  journal   = "Int. J. Pattern Recognit Artif Intell.",
852
+  publisher = "World Scientific",
853
+  volume    =  18,
854
+  number    =  03,
855
+  pages     = "265--298",
856
+  year      =  2004,
857
+  keywords  = "compsci;Efficient Event Classification through Constrained
858
+               Subgraph Mining"
859
+}
860
+
861
+@INPROCEEDINGS{Nguyen2009-ts,
862
+  title     = "Graph-based Mining of Multiple Object Usage Patterns",
863
+  booktitle = "Proceedings of the the 7th Joint Meeting of the European
864
+               Software Engineering Conference and the {ACM} {SIGSOFT}
865
+               Symposium on The Foundations of Software Engineering",
866
+  author    = "Nguyen, Tung Thanh and Nguyen, Hoan Anh and Pham, Nam H and
867
+               Al-Kofahi, Jafar M and Nguyen, Tien N",
868
+  publisher = "ACM",
869
+  pages     = "383--392",
870
+  series    = "ESEC/FSE '09",
871
+  year      =  2009,
872
+  address   = "New York, NY, USA",
873
+  keywords  = "anomaly, api usage, clone, graph mining, groum, object usage,
874
+               pattern;compsci;Efficient Event Classification through
875
+               Constrained Subgraph Mining"
876
+}
877
+
878
+@ARTICLE{Washio2003-fc,
879
+  title     = "State of the Art of Graph-based Data Mining",
880
+  author    = "Washio, Takashi and Motoda, Hiroshi",
881
+  journal   = "SIGKDD Explor. Newsl.",
882
+  publisher = "ACM",
883
+  volume    =  5,
884
+  number    =  1,
885
+  pages     = "59--68",
886
+  month     =  jul,
887
+  year      =  2003,
888
+  address   = "New York, NY, USA",
889
+  keywords  = "data mining, graph, graph-based data mining, path, structured
890
+               data, tree;compsci;Efficient Event Classification through
891
+               Constrained Subgraph Mining"
892
+}
893
+
894
+@INPROCEEDINGS{Cheng2009-td,
895
+  title     = "Identifying Bug Signatures Using Discriminative Graph Mining",
896
+  booktitle = "Proceedings of the Eighteenth International Symposium on
897
+               Software Testing and Analysis",
898
+  author    = "Cheng, Hong and Lo, David and Zhou, Yang and Wang, Xiaoyin and
899
+               Yan, Xifeng",
900
+  publisher = "ACM",
901
+  pages     = "141--152",
902
+  series    = "ISSTA '09",
903
+  year      =  2009,
904
+  address   = "New York, NY, USA",
905
+  keywords  = "bug signature, discriminative subgraph mining;compsci;Efficient
906
+               Event Classification through Constrained Subgraph Mining"
907
+}
908
+
909
+@INPROCEEDINGS{Zhang2009-wg,
910
+  title     = "{GADDI}: Distance Index Based Subgraph Matching in Biological
911
+               Networks",
912
+  booktitle = "Proceedings of the 12th International Conference on Extending
913
+               Database Technology: Advances in Database Technology",
914
+  author    = "Zhang, Shijie and Li, Shirong and Yang, Jiong",
915
+  publisher = "ACM",
916
+  pages     = "192--203",
917
+  series    = "EDBT '09",
918
+  year      =  2009,
919
+  address   = "New York, NY, USA",
920
+  keywords  = "compsci;Efficient Event Classification through Constrained
921
+               Subgraph Mining"
922
+}
923
+
924
+@INPROCEEDINGS{Jin2011-vz,
925
+  title     = "{LTS}: Discriminative subgraph mining by learning from search
926
+               history",
927
+  booktitle = "2011 {IEEE} 27th International Conference on Data Engineering",
928
+  author    = "Jin, N and Wang, W",
929
+  publisher = "ieeexplore.ieee.org",
930
+  pages     = "207--218",
931
+  month     =  apr,
932
+  year      =  2011,
933
+  keywords  = "data mining;graph theory;greedy algorithms;learning (artificial
934
+               intelligence);pattern classification;branch and bound
935
+               algorithm;discriminative subgraph mining method;graph
936
+               classifier;graph indices;greedy algorithm;learning to
937
+               search;Accuracy;Algorithm design and analysis;Chemical
938
+               compounds;Classification algorithms;Frequency
939
+               estimation;History;Kernel;compsci;Efficient Event Classification
940
+                     and Lepratti, Raffaello and Kr{\"o}ger, Peer",
941
+}

+ 145
- 0
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/dbstmpl.sty View File

@@ -0,0 +1,145 @@
1
+%%
2
+%% dbstmpl.sty
3
+%% von Stefan Brecheisen, LFE fuer Datenbanksysteme am
4
+%% Institut fuer Informatik der LMU Muenchen
5
+%%
6
+%% kleine Anpassungen an das aktuelle CD von Marisa Thoma
7
+%%
8
+%% Dieses Latex-Paket definiert sinnvolle Einstellungen und hilfreiche
9
+%% Befehle zum Erstellen der Ausarbeitung einer Diplom- oder Projektarbeit
10
+%% an der LFE fuer Datenbanksysteme.
11
+%%
12
+\def\fileversion{v1.1}
13
+\def\filedate{2010/06/21}
14
+
15
+\NeedsTeXFormat{LaTeX2e}
16
+
17
+\ProvidesPackage{dbstmpl}[\filedate\space\fileversion]
18
+
19
+%%
20
+%% benoetigte Pakete
21
+%%
22
+%% Wenn es englisch sein soll:
23
+%\RequirePackage{lmodern,polyglossia}
24
+\RequirePackage[german,english]{babel}
25
+%% sonst: für deutsche Arbeiten:
26
+%\RequirePackage[english,german]{babel}
27
+\RequirePackage{german}
28
+\RequirePackage{amsmath}
29
+\RequirePackage{amssymb}
30
+\RequirePackage{geometry}
31
+\RequirePackage{fancyhdr}
32
+\RequirePackage[nottoc]{tocbibind}
33
+\RequirePackage{graphicx}
34
+
35
+%%
36
+%% haeufig benutzte Symbole
37
+%%
38
+\newcommand{\N}{\mathbb{N}}    % Menge der natuerlichen Zahlen
39
+\newcommand{\Z}{\mathbb{Z}}    % Menge der ganzen Zahlen
40
+\newcommand{\Q}{\mathbb{Q}}    % Menge der rationalen Zahlen
41
+\newcommand{\R}{\mathbb{R}}    % Menge der reellen Zahlen
42
+\newcommand{\C}{\mathbb{C}}    % Menge der komplexen Zahlen
43
+
44
+%%
45
+%% Einstellungen
46
+%%
47
+
48
+% Seitenraender
49
+\geometry{body={140mm,210mm},footskip=12mm}
50
+
51
+% Gestaltung der Kopf- und Fusszeilen
52
+\pagestyle{fancy}
53
+\headheight 14pt
54
+\fancyhf{}
55
+\fancyhead[L]{\small\slshape\leftmark}
56
+\fancyfoot[C]{\thepage}
57
+
58
+% subsubsections numerieren und ins Inhaltsverzeichnis aufnehmen
59
+\setcounter{secnumdepth}{3}
60
+\setcounter{tocdepth}{3}
61
+
62
+%%
63
+%% Globale Variablen
64
+%%
65
+\newtoks\arbeit             % Art der Arbeit
66
+\newtoks\fach               % Studiengang
67
+\newtoks\titel              % Titel der Arbeit
68
+\newtoks\bearbeiter         % Name des Bearbeiters
69
+\newtoks\betreuer           % Name des Betreuers
70
+\newtoks\aufgabensteller    % Name des Aufgabenstellers
71
+\newtoks\abgabetermin       % Datum der Abgabe
72
+\newtoks\ort                % Wohnort des Bearbeiters
73
+
74
+%%
75
+%% Ausgabe des Deckblatts fuer eine Diplom- oder Projektarbeit.
76
+%%
77
+\newcommand{\deckblatt}{
78
+  \begin{titlepage}
79
+  ~
80
+  \vspace{-2cm}
81
+  \begin{center}
82
+   \parbox[t]{145.5mm}{ \includegraphics[width=145.5mm]{kopf} }
83
+  \end{center}
84
+    
85
+  \begin{center}
86
+  
87
+      \vspace{2.5cm}\Large
88
+      \the\arbeit
89
+    
90
+      {\large in \the\fach}
91
+
92
+      \vspace{1cm}\huge
93
+      \the\titel
94
+
95
+      \vspace{1cm}\large
96
+      \the\bearbeiter
97
+
98
+      \vspace{\fill}\normalsize
99
+      \begin{tabular}{ll}
100
+        Aufgabensteller: & \the\aufgabensteller\\
101
+        Betreuer:        & \the\betreuer\\
102
+        Abgabedatum:     & \the\abgabetermin
103
+      \end{tabular}
104
+    
105
+    \end{center}
106
+  
107
+  \end{titlepage}
108
+}
109
+
110
+%%
111
+%% Ausgabe der Erklaerung ueber die selbstaendige Anfertigung
112
+%% einer Diplomarbeit
113
+%%
114
+\newcommand{\erklaerung}{
115
+  \begin{titlepage}
116
+    \vspace*{\fill}
117
+    \parindent 0cm
118
+    \begin{center}
119
+      \textbf{Erkl"arung}
120
+
121
+      \vspace{1cm}
122
+  
123
+      \begin{minipage}{9.8cm}
124
+        Hiermit versichere ich, dass ich diese \the\arbeit\ selbst\"andig verfasst und keine anderen als die angegebenen Quellen und Hilfsmittel verwendet habe.
125
+
126
+        \vspace{1cm}
127
+        \the\ort, den \the\abgabetermin
128
+
129
+        \vspace{1.5cm}
130
+        \makebox[9.8cm]{\dotfill}\\
131
+        \the\bearbeiter
132
+
133
+      \end{minipage}
134
+  
135
+    \end{center}
136
+  
137
+    \vspace*{\fill}
138
+  \end{titlepage}
139
+}
140
+
141
+\newcommand{\emptypage}{
142
+\begin{titlepage}
143
+\vspace*{\fill}
144
+\end{titlepage}
145
+}

BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/dummy_pattern.png View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/dummy_random_walk.png View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/example_pattern.png View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/fullgraph.png View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/partial_graph.png View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/small_pattern.png View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/images/sphx_glr_plot_separating_hyperplane_0011.png View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/kopf.pdf View File


BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/presentation.pdf View File


+ 285
- 0
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/presentation.tex View File

@@ -0,0 +1,285 @@
1
+\documentclass{beamer}
2
+%\setbeameroption{show only notes}
3
+\usepackage{polyglossia}
4
+\usetheme{default} %minimal
5
+\setbeamercovered{transparent}
6
+\setbeamertemplate{bibliography item}{}
7
+\setbeamertemplate{caption}[numbered]
8
+\setbeamercolor*{bibliography entry title}{fg=black}
9
+\setbeamercolor*{bibliography entry author}{fg=black}
10
+\setbeamercolor*{bibliography entry location}{fg=black}
11
+\setbeamercolor*{bibliography entry note}{fg=black}
12
+\usepackage{natbib}
13
+\usepackage{tikz}
14
+\bibliographystyle{plain}
15
+\renewcommand\bibfont{\scriptsize}
16
+\beamertemplatenavigationsymbolsempty
17
+
18
+\AtBeginSection[]
19
+{
20
+  \begin{frame}<beamer>
21
+    \frametitle{Outline}
22
+    \tableofcontents[currentsection]
23
+  \end{frame}
24
+}
25
+
26
+
27
+\title{Efficient Event Classification through Constrained Subgraph Mining}
28
+\subtitle{Bachelor's Thesis Final Presentation}
29
+
30
+\author{Simon Lackerbauer}
31
+
32
+\institute[Ludwig-Maximilians-Universität München]
33
+{
34
+}
35
+
36
+\date{2018-04-23}
37
+
38
+\subject{}
39
+
40
+\AtBeginSubsection[]
41
+{
42
+  \begin{frame}<beamer>{Outline}
43
+    \tableofcontents[currentsection,currentsubsection]
44
+  \end{frame}
45
+}
46
+
47
+
48
+\begin{document}
49
+  
50
+  \begin{frame}
51
+    \titlepage
52
+  \end{frame}
53
+  
54
+  \begin{frame}{Outline}
55
+    \tableofcontents
56
+  \end{frame}
57
+  
58
+  \section{Problem Statement}
59
+  
60
+  \begin{frame}{Problem Statement}{}
61
+    \begin{itemize}
62
+      \item Data source: an industrial production line operated by Siemens
63
+      \item Data type: error messages from the various production modules
64
+      \item The data are proprietary, so an additional synthetic data set was constructed; it is used on most of the following slides
65
+    \end{itemize}
66
+  \end{frame}
67
+
68
+\begin{frame}{Problem Statement}
69
+    \begin{itemize}
70
+        \item The facility suffers frequent outages
71
+        \item The goal was to generate patterns from the data that point to the root causes of the problems in the production flow
72
+        \item With these patterns, the plant engineers should be able to identify the reasons for the frequent outages and mitigate them accordingly
73
+    \end{itemize}
74
+\end{frame}
75
+
76
+  \begin{frame}{Example Data}
77
+\begin{table}[]
78
+    \centering
79
+    \caption{Synthetic data set (excerpt)}
80
+    \label{table:dummy_messages}
81
+    \footnotesize 
82
+    \begin{tabular}{l|l|l|l}
83
+        time stamp & log message & module id & part id \\ \hline
84
+        2017-04-05 11:01:05 & Laser überhitzt            & Module 1 & 88495775TEST \\
85
+        2017-04-05 11:01:05 & Laser überhitzt            & Module 1 & 88495776TEST \\
86
+        2017-04-05 11:01:06 & Teil verkantet             & Module 2 & 88495776TEST \\
87
+        2017-04-05 11:01:06 & Laser überhitzt            & Module 1 & 88495776TEST \\
88
+        2017-04-05 11:01:10 & Laser überhitzt            & Module 1 & 88495776TEST \\
89
+        2017-04-05 11:01:12 & Auffangbehälter leeren     & Module 2 & 88495775TEST \\
90
+        2017-04-05 11:01:17 & Unbekannter Ausnahmefehler & Module 0 & 88495775TEST \\
91
+        2017-04-05 11:01:17 & Auffangbehälter leeren     & Module 2 & 88495775TEST \\
92
+        2017-04-05 11:01:19 & Unbekannter Ausnahmefehler & Module 0 & 88495775TEST \\
93
+        2017-04-05 11:05:22 & Laser überhitzt            & Module 1 & 88495775TEST \\
94
+        \multicolumn{1}{c}{\vdots} & \multicolumn{1}{c}{\vdots} & \multicolumn{1}{c}{\vdots} & \multicolumn{1}{c}{\vdots}
95
+    \end{tabular}
96
+\end{table}
97
+\end{frame}
98
+
99
+\begin{frame}{Problem Statement}
100
+The error messages are
101
+\begin{itemize}
102
+    \item completely unstructured
103
+    \item entirely in German
104
+    \item very short, i.e. not full sentences
105
+    \item in part understandable only to domain experts
106
+\end{itemize}
107
+\end{frame}
108
+
109
+\begin{frame}{Evaluation}
110
+\begin{itemize}
111
+    \item As an additional metric on the facility, the \textit{Overall Equipment Efficiency} (OEE) was provided
112
+    \item The OEE score is a quantity between 0 and 1, computed as $OEE = \frac{POK \cdot CT}{OT}$ (a small worked sketch follows below)
113
+    \item Anomaly detection was performed on the OEE time series
114
+    \item The mined patterns should then predict these anomalies
115
+\end{itemize}
116
+\end{frame}
117
+
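To make the OEE bullet concrete, here is a minimal sketch in Python; it is not the thesis code, reading POK as the count of good parts, CT as the ideal cycle time and OT as the operating time is an assumption about the abbreviations, and the simple drop threshold only stands in for the unspecified anomaly detection.

    # OEE per 5-minute window; POK = parts OK, CT = ideal cycle time,
    # OT = operating time (these expansions are assumptions, not thesis code).
    def oee(parts_ok, cycle_time_s, operating_time_s):
        if operating_time_s <= 0:
            return 0.0
        return min(1.0, (parts_ok * cycle_time_s) / operating_time_s)

    # Toy windows: (parts OK, ideal cycle time in s, operating time in s).
    windows = [(55, 5.0, 300.0), (52, 5.0, 300.0), (31, 5.0, 300.0), (50, 5.0, 300.0)]
    scores = [oee(*w) for w in windows]      # roughly [0.92, 0.87, 0.52, 0.83]

    # Naive stand-in for the anomaly detection on the OEE time series:
    # flag a window if its score drops by more than 0.2 versus its predecessor.
    drops = [i for i in range(1, len(scores)) if scores[i - 1] - scores[i] > 0.2]
    print(scores, drops)                     # drops == [2]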
118
+  \section{Previous Approaches and Idea}
119
+\begin{frame}{Sequential Pattern Mining}
120
+\begin{itemize}
121
+    \item Generating sequences of \textit{frequent patterns} had already led to small successes
122
+    \item Unfortunately, the patterns found were already known to the engineers with expert knowledge
123
+\end{itemize}
124
+\end{frame}
125
+
126
+\begin{frame}{First Approach: A Single Large Graph}
127
+\begin{itemize}
128
+    \item Approaches for mining patterns on a single large graph already exist (cf. \textit{GRAMI}, Elseidy et al., 2014, and \textit{POSGRAMI}, Moussaoui et al., 2016)
129
+    \item One idea of our own was to use shortest-path search (Dijkstra) to find longer paths that build on one another and are therefore presumably causally related (sketched below)
130
+\end{itemize}
131
+\end{frame}
132
+
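A hedged sketch of that shortest-path idea, using networkx as an assumed library (the slides do not name one) and an illustrative edge encoding rather than the thesis's actual graph:

    import networkx as nx

    G = nx.DiGraph()
    # Illustrative edges: consecutive error messages seen for the same part,
    # weighted by the time gap in seconds between them.
    G.add_weighted_edges_from([
        ("Laser überhitzt", "Teil verkantet", 1.0),
        ("Teil verkantet", "Auffangbehälter leeren", 6.0),
        ("Laser überhitzt", "Auffangbehälter leeren", 12.0),
        ("Auffangbehälter leeren", "Unbekannter Ausnahmefehler", 5.0),
    ])

    # The temporally tightest route between two error types; chaining such
    # paths was the idea for finding longer, presumably causal, sequences.
    print(nx.dijkstra_path(G, "Laser überhitzt", "Unbekannter Ausnahmefehler"))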
133
+  \begin{frame}{Visualization of the Large Graph}
134
+\begin{figure}
135
+    \centering
136
+    \noindent\includegraphics[width=\linewidth]{images/fullgraph}
137
+    %\caption{Naive single graph encoding all available information}
138
+    \label{fig:fullgraph}
139
+\end{figure}
140
+\end{frame}
141
+
142
+\section{gSpan and SVM}
143
+\begin{frame}{Graph Construction}
144
+\begin{itemize}
145
+    \item The data were brought into graph form, using knowledge about the facility layout as \textit{constraints}
146
+    \item Each graph encodes 5 minutes of information
147
+    \item The \textit{gSpan} algorithm is then run on the set of generated graphs to search for patterns (a simplified windowing sketch follows below)
148
+\end{itemize}
149
+\end{frame}
150
+
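A simplified sketch of the windowing step; pandas and networkx are assumed choices, and the module-to-message edge encoding below is illustrative only, the thesis's constraint-based encoding is richer:

    import pandas as pd
    import networkx as nx

    log = pd.DataFrame({
        "timestamp": pd.to_datetime(["2017-04-05 11:01:05", "2017-04-05 11:01:06",
                                     "2017-04-05 11:05:22"]),
        "message":   ["Laser überhitzt", "Teil verkantet", "Laser überhitzt"],
        "module":    ["Module 1", "Module 2", "Module 1"],
    })

    graphs = []
    for _, rows in log.groupby(pd.Grouper(key="timestamp", freq="5min")):
        g = nx.Graph()
        for _, row in rows.iterrows():
            # Illustrative: connect the emitting module to the message it logged.
            g.add_edge(("module", row["module"]), ("message", row["message"]))
        graphs.append(g)

    print(len(graphs), [g.number_of_edges() for g in graphs])   # 2 [2, 1]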
151
+\begin{frame}{Graph Isomorphism}
152
+\begin{itemize}
153
+    \item The fundamental problem in graph mining is deciding whether two (sub)graphs are isomorphic to each other
154
+    \item Def.: Let $G$ and $H$ be graphs. Then $G \simeq H$ iff there is a bijection $f: V(G) \rightarrow V(H)$ such that for all $u, v \in V(G)$: $(u,v) \in E(G) \Leftrightarrow (f(u), f(v)) \in E(H)$.
155
+    \item The subgraph isomorphism problem is NP-complete (a small illustration follows below)
156
+\end{itemize}
157
+\end{frame}
158
+
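For illustration only (networkx is an assumed choice, and gSpan itself sidesteps pairwise tests by comparing canonical DFS codes), the check looks like this, including the label-respecting variant that matters here:

    import networkx as nx
    from networkx.algorithms.isomorphism import GraphMatcher, categorical_node_match

    G = nx.Graph([("a", "b"), ("b", "c")])
    H = nx.Graph([("x", "y"), ("y", "z")])
    print(nx.is_isomorphic(G, H))      # True: both are paths on three vertices

    # With vertex labels the bijection must also preserve the labels.
    nx.set_node_attributes(G, {"a": "X", "b": "Y", "c": "Z"}, "label")
    nx.set_node_attributes(H, {"x": "X", "y": "Z", "z": "Y"}, "label")
    matcher = GraphMatcher(G, H, node_match=categorical_node_match("label", None))
    print(matcher.is_isomorphic())     # False: no label-preserving mapping exists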
159
+\begin{frame}{gSpan}
160
+\begin{itemize}
161
+    \item \textit{gSpan} is a pattern-growth algorithm by \textit{Yan and Han} from 2002
162
+    \item \textit{gSpan} assigns every graph a canonical label based on a DFS traversal (DFS codes)
163
+    \item Two graphs with the same canonical label are isomorphic
164
+    \item \textit{gSpan} then finds all subgraphs of the graphs in a given set that reach a \textit{minimum support threshold} (\textit{min\_sup}); a toy support-counting sketch follows below.
165
+\end{itemize}
166
+\end{frame}
167
+
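A toy illustration of the support side only. It is a deliberate simplification: graphs and patterns are reduced to sets of labeled edges so that containment is plain set inclusion, whereas real gSpan grows DFS codes and tests genuine subgraph isomorphism.

    from itertools import combinations

    # Three toy graphs, each as a set of (source label, edge label, target label).
    database = [
        {("X", "a", "Y"), ("Y", "b", "Z")},
        {("X", "a", "Y"), ("Y", "d", "U")},
        {("X", "a", "Y"), ("Y", "b", "Z"), ("Y", "d", "U")},
    ]

    def support(pattern, db):
        return sum(1 for g in db if pattern <= g) / len(db)

    min_sup = 0.6
    candidates = {frozenset(c) for g in database for r in (1, 2) for c in combinations(g, r)}
    for p in sorted((p for p in candidates if support(p, database) >= min_sup), key=len):
        print(sorted(p), round(support(p, database), 2))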
168
+\begin{frame}{Modification of gSpan}
169
+\begin{itemize}
170
+    \item While implementing \textit{gSpan} in Python it became apparent that the DFS codes behave much like hashes, but the data structure used does not make comparison operations very efficient
171
+    \item Unfortunately, gSpan cannot be switched entirely to pure hash comparisons, because a strict total order is required on the set of DFS codes (see the comparison sketch below)
172
+\end{itemize}
173
+\end{frame}
174
+
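A sketch of that comparison point; the tuple-of-5-tuples representation is an assumption, not the thesis data structure. Hashing makes grouping by identical codes cheap, while the minimality test still needs an order, approximated here by plain lexicographic tuple comparison.

    # DFS codes as tuples of (i, j, label_i, edge_label, label_j) entries.
    code_a = ((0, 1, "X", "a", "Y"), (1, 2, "Y", "b", "Z"))
    code_b = ((0, 1, "X", "a", "Y"), (1, 2, "Y", "d", "U"))

    # Hash-based grouping: graphs with identical canonical codes collapse together.
    seen = {}
    for graph_id, code in [("g1", code_a), ("g2", code_b), ("g3", code_a)]:
        seen.setdefault(code, []).append(graph_id)
    print(seen[code_a])                    # ['g1', 'g3']

    # But gSpan also needs a total order over codes (minimum-DFS-code test);
    # hashes cannot provide that, only comparisons can.
    print(min(code_a, code_b) is code_a)   # True: 'b' < 'd' at the second entry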
175
+\begin{frame}{Example DFS Code}
176
+\begin{figure}
177
+    \centering
178
+    \begin{tikzpicture}[node distance = 2cm]
179
+    \tikzset{VertexStyle/.style = {
180
+            shape=circle,
181
+            draw=black
182
+    }}
183
+    \node[VertexStyle, label={[label distance=-.2cm]45:\small $v_1$}] (1){X};
184
+    \node[VertexStyle, right of= 1, label={[label distance=-.2cm]45:\small $v_2$}] (2){Y};	
185
+    \node[VertexStyle, right of= 2, label={[label distance=-.2cm]45:\small $v_3$}] (3){Z};
186
+    \node[VertexStyle, below of= 2, right of=2, label={[label distance=-.2cm]45:\small $v_4$}] (4){U};
187
+    
188
+    \path [-] (1) edge node[above] {a} (2);
189
+    \path [-] (2) edge node[above] {b} (3);
190
+    \path [-] (3) edge node[left] {c} (4);
191
+    \path [-] (2) edge node[left] {d} (4);
192
+    \end{tikzpicture}
193
+    %\caption[Graph $G$ from chapter \ref{chapter:theoretical_basis} with labels]{Graph $G$ from chapter \ref{chapter:theoretical_basis} with labels}
194
+    \label{fig:example_graph_dfs}
195
+\end{figure}
196
+\begin{table}[h]
197
+    \centering
198
+    %\caption{Minimum DFS code of graph $G$}
199
+    \label{table:dummy_min_dfs_codes}
200
+    \begin{tabular}{l|l}
201
+        edge no. & DFS code          \\ \hline
202
+        0        & $(0,2,U,d,Y)$     \\
203
+        1        & $(1, 2, X, a, Y)$ \\
204
+        2        & $(0, 3, U, c,Z)$  \\
205
+        3        & $(2, 3, Y, b, Z)$
206
+    \end{tabular}
207
+\end{table}
208
+\end{frame}
209
+
210
+\begin{frame}{Pattern-Growth Aspect}
211
+\begin{itemize}
212
+    \item When searching for new patterns, \textit{gSpan} builds on the patterns already found
213
+    \item Pattern candidates may attach new edges only along the \textit{rightmost path}, which narrows the search space (sketched below)
214
+\end{itemize}
215
+\end{frame}
216
+
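A small sketch of what the rightmost path is operationally, with DFS-code entries written as (i, j, l_i, l_e, l_j) and forward (tree) edges having i < j; this illustrates the restriction and is not the thesis implementation.

    def rightmost_path(code):
        """Vertices from the DFS root down to the most recently discovered vertex."""
        forward = [(i, j) for (i, j, _, _, _) in code if i < j]   # tree edges
        if not forward:
            return [0]
        path = [forward[-1][1], forward[-1][0]]    # rightmost vertex and its parent
        for i, j in reversed(forward[:-1]):
            if j == path[-1]:                      # climb the tree edge into that vertex
                path.append(i)
        return list(reversed(path))

    code = ((0, 1, "X", "a", "Y"), (1, 2, "Y", "b", "Z"),
            (2, 0, "Z", "c", "X"), (1, 3, "Y", "d", "U"))
    print(rightmost_path(code))    # [0, 1, 3]: new edges may only grow from 0, 1 or 3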
217
+\begin{frame}{Support Vector Machine}
218
+\begin{itemize}
219
+    \item An SVM was used to classify the mined patterns against the detected anomalies
220
+    \item An SVM is a supervised learning model that can separate high-dimensional data points into two classes relatively efficiently (a minimal sketch follows below)
221
+\end{itemize}
222
+\end{frame}
223
+
224
+\section{Results}
225
+
226
+\begin{frame}{Example pattern}
227
+\begin{figure}
228
+    \centering
229
+    \includegraphics[width=1\linewidth]{images/dummy_pattern}
230
+    %\caption{8-edge pattern from the synthetic data set, min\_sup = .4}
231
+    \label{fig:dumm_pattern}
232
+\end{figure}
233
+\end{frame}
234
+
235
+\begin{frame}{Synthetic OEE trajectory}
236
+
237
+\begin{figure}
238
+    \centering
239
+    \includegraphics[width=1\linewidth]{images/dummy_random_walk}
240
+    %\caption{Synthetic OEE values}
241
+    \label{fig:dummyrandomwalk}
242
+\end{figure}
243
+\end{frame}
244
+
245
+\begin{frame}{Run times for synthetic data}
246
+\begin{table}[]
247
+    \centering
248
+    %\caption{Run times and patterns found (synthetic data set)}
249
+    \label{table:runtimes_syn}
250
+    \begin{tabular}{l|l|l}
251
+        data set                                      & \textit{t} & patterns \\ \hline
252
+        import errors and graph generation            & 1s         &          \\
253
+        import and anomalies detection on OEE scores  & 8s         &          \\ \hline
254
+        \textit{gSpan} (min\_sup = .7)                & 2s         & 40       \\
255
+        \textit{gSpan} (min\_sup = .6)                & 8s         & 106      \\
256
+        \textit{gSpan} (min\_sup = .5)                & 19s        & 241      \\
257
+        \textit{gSpan} (min\_sup = .4)                & 74s        & 1056     \\ \hline
258
+        SVM training and validation   (min\_sup = .7) & 4s         &          \\
259
+        SVM training and validation   (min\_sup = .6) & 8s         &          \\
260
+        SVM training and validation   (min\_sup = .5) & 35s        &          \\
261
+        SVM training and validation   (min\_sup = .4) & 13m 14s    &
262
+    \end{tabular}
263
+\end{table}
264
+The validation data set consisted of 49 time windows, 33 of which were deemed as a noticeable drop by the OEE evaluation algorithm. Of these 33, the SVM correctly identified 28 as drops, for a sensitivity score of 84.85\%. Of the remaining 19 non-drops, 5 were falsely identified as positives, for a specificity score of 73.68\%.
265
+\end{frame}
266
+
267
+\begin{frame}{Run times for real facility data}
268
+\begin{table}
269
+    \centering
270
+    %\caption{Run times and patterns found (facility data set)}
271
+    \label{table:runtimes_real} 
272
+    \begin{tabular}{l|l|l}
273
+        data set                                      & \textit{t}          & patterns \\ \hline
274
+        import errors and graph generation            & 50s                 &          \\
275
+        import and anomalies detection on OEE         & 2m 27s              &          \\ \hline
276
+        \textit{gSpan} (min\_sup = .9)                & 2m 20s              & 12       \\
277
+        \textit{gSpan} (min\_sup = .7)                & 6h 27m 12s          & 846      \\
278
+        \textit{gSpan} (min\_sup = .5)                & \textit{OOM killed} & --       \\ \hline
279
+        SVM training and validation   (min\_sup = .7) & 27s                 &
280
+    \end{tabular}
281
+\end{table}
282
+The validation data set consisted of 486 time windows, 64 of which were deemed as a noticeable drop by the OEE evaluation algorithm. Of these 64, the SVM trained on patterns with a min\_sup of .7 correctly identified 60 as drops, for a sensitivity score of 93.75\%. Of the remaining 422 non-drops, 18 were identified as false positives, for a specificity score of 95.73\%.
283
+\end{frame}
284
+
285
+\end{document}

BIN
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/thesis.pdf View File


+ 657
- 0
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/thesis.tex View File

@@ -0,0 +1,657 @@
1
+% warning: use pdflatex to compile!
2
+\documentclass[pdftex,12pt,a4paper]{report}
3
+\usepackage{dbstmpl}
4
+\usepackage{subfigure}
5
+\usepackage{graphicx}
6
+\usepackage{url}
7
+\usepackage{tikz}
8
+\usetikzlibrary{arrows,shapes,positioning}
9
+\usepackage{algorithm}
10
+\usepackage{algpseudocode}
11
+\algnewcommand\algorithmicforeach{\textbf{for each}}
12
+\algdef{S}[FOR]{ForEach}[1]{\algorithmicforeach\ #1\ \algorithmicdo}
13
+\usepackage{tabularx}
14
+
15
+\global\arbeit{Bachelorarbeit}
16
+\global\titel{Efficient Event Classification through Constrained Subgraph Mining}
17
+\global\bearbeiter{Simon Lackerbauer}
18
+\global\betreuer{Martin Ringsquandl}
19
+\global\aufgabensteller{Prof. Dr. Peer Kr"oger}
20
+\global\abgabetermin{16. Februar 2018}
21
+\global\ort{M"unchen}
22
+\global\fach{Informatik}
23
+
24
+\begin{document}
25
+
26
+\deckblatt
27
+\erklaerung
28
+
29
+\begin{abstract}
30
+With this work, I consider the problem of discovering patterns in error log data that predict common failure modes within a near fully-automated assembly line. I present a novel approach of encoding error log events in a graph structure and leverage a constraint-based mining method to efficiently discover and score sophisticated patterns in these data, using a self-implemented version of the pattern-growth algorithm \textit{gSpan}. As the algorithm as implemented does not scale quite as well as many traditional sequential pattern mining approaches, outside expert knowledge should be used to keep the input data at a manageable size and to help with graph construction.
31
+\end{abstract}
32
+
33
+\tableofcontents
34
+
35
+\chapter{Introduction}
36
+The ability to analyze log files is a crucial part of the work of systems administrators and software developers. Developers often enough deliberately provoke the generation of detailed logs while debugging a piece of software, while system and network operators are routinely pulled away from their efforts to fine-tune services by incessant alerts from their monitoring systems.
37
+
38
+Indeed, transaction log files, a common subgroup of the more general event logs, have been used to diagnose errors in the workings of data processing systems nearly since their inception, and dedicated research efforts into the analysis of transaction log files can be traced back to at least the mid-1960s.\cite{Peters1993-fw}
39
+
40
+With the decline of monolithic service architectures, and the recent rise of complex systems of interdependent microservices,\cite{Dragoni2016-fh} the accurate reading of log files and recognition of the patterns within has become more important than ever. Nowadays, a plethora of log file management and search tools exist as both open source and commercially licensed tooling.\cite{noauthor_undated-bv}\cite{noauthor_undated-zu}\cite{noauthor_undated-vl}\cite{noauthor_undated-xi} More sophisticated log file analysis that goes beyond simple exploratory data analysis and simple monitoring and alerting systems is, however, not usually the focus of these tools.
41
+
42
+But not only modern microservice architectures need powerful log file analysis tools to understand where bottlenecks and emergent properties of the architecture might stem from. That's why, with this bachelor's thesis, I want to leverage these techniques with a slightly more traditional, yet similarly highly modularized, architecture in mind: an automated production line.
43
+
44
+Modern automated production lines usually consist of highly sophisticated robotic modules that assemble each produced unit with an efficiency and consistency that would be virtually impossible for human workers to achieve. However, at the same time, such automated systems are less tolerant of errors accumulating along the way and might stop working for relatively mundane reasons like a part being slightly out of position on the assembly line. Unlike a human worker, a specialized robot cannot solve most of these problems by itself. Depending on the problems encountered, the amount of time and (human) effort needed to deal with them can drive up the cost of running the line enormously, maybe even up to the point of operating at a loss.
45
+
46
+If problems like these crop up consistently, it is only natural to assume a common cause between propagated failures along the assembly line\cite{Ringsquandl2016-en} and it can be assumed that, after identifying such causes early on and mitigating them, propagation of failures might be reduced or entirely averted. Improving efficiency in this way was the goal for the production line analyzed in this thesis as well.
47
+
48
+Thus, the aim of this thesis will be to work on this problem using a graph mining based approach, starting with an overview of the literature (chapter \ref{chapter:related_work}) and the theoretical basis of graph mining (chapter \ref{chapter:theoretical_basis}), reflecting on how to extract a graph data structure from the data available as a log table (section \ref{section:data_set_splicing}), how best to mine the resulting graph or graphs for patterns (section \ref{section:gspan}), and eventually how to assess the resulting patterns (section \ref{subsection:evaluation}).
49
+
50
+\chapter{Related work}
51
+\label{chapter:related_work}
52
+Leveraging a graph mining based approach for mining log data constitutes a relatively novel use for most of the algorithms implemented in this work. Graph mining as a concept has, up to this point, mostly been used on data that naturally lend themselves to a graph- or network-based data structure, such as social networks, modeling human relationship networks in general, chemical component analysis or link networks.\cite{Washio2003-fc}\cite{Han2007-qx} Meanwhile, mostly sequential pattern mining approaches have been leveraged against access or error log based data, such as the \textit{basket data} collected by most large retailers nowadays\cite{Agrawal1993-nc} or \textit{web access logs}\cite{Pei2000-rz}. The \textit{gSpan} algorithm was originally tested on synthetic graph data as well as for mining chemical compound data.\cite{Yan2002-sj}
53
+
54
+Most of the above mentioned naturally occurring graph data don't include time as a feature having measurable impact on the depicted relations. Chemical compounds may change and degrade over time, but their graph representations usually portray their idealized form. Social networks have also been of interest as dynamic processes themselves,\cite{Kossinets2006-rw} but these analyses tend to focus on snapshots of the full network after longer periods of time, whereas the data set examined in this work is a classical time series, with each row explicitly time stamped down to the second.
55
+
56
+\section{Sequential pattern mining}
57
+Sequential pattern analysis is a staple approach in time series mining. The term was first introduced and defined by Agrawal and Srikant\cite{Agrawal1994-ca} as follows: 
58
+\begin{quotation}
59
+	[G]iven a sequence database where each sequence is a list of transactions ordered by transaction time and each transaction consists of a set of items, find all sequential patterns with a user-specified minimum support, where the support is the number of data sequences that contain the pattern.
60
+\end{quotation}
61
+
62
+More formally, let $I = \{i_0, i_1, ..., i_n\}$ be the set of all items. Then a $k$-itemset $I^*$, which consists of $k$ items from $I$, is said to be \textit{frequent} if it occurs in a transaction database $D$ no less than $\theta|D|$ times, where $\theta$ is a user-specified \textit{minimum support threshold} (often abbreviated \textit{min\_sup}).
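+For example, in a hypothetical transaction database with $|D| = 200$ transactions and a user-specified $\theta = 0.4$, an itemset would have to occur in at least $\theta|D| = 80$ transactions to be considered frequent.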
63
+
64
+Mining sequential patterns can take the form of a-priori algorithms (see subsection \ref{subsection:apriori}), like Srikant and Agrawal's GSP\cite{Srikant1996-dy} or pattern growth algorithms, like Yan and Han's \textit{gSpan}\cite{Yan2002-sj} used in this work.
65
+
66
+\section{Classical graph mining approaches}
67
+The classical graph mining approach often focuses on obtaining general structural information about a network. As an example, the mining of social networks often reveals an overall structure of almost-leaves (e.g. friend- or kinship groups) being connected via so-called ``multiplicators'' or ``influencers'' -- nodes with a high degree centrality\cite{Newman2010-ac} that are also interconnected with each other, together acting as the central cluster in a basically star-shaped network, which offers an explanation for the small-world problem\cite{Travers1967-cn}.
68
+
69
+Meanwhile, for this work, the overall structure of the graph (the dependency network between modules and inputs for the assembly line) is known beforehand, and even formally defined. Analysis of the design of the found substructures -- viz. \textit{what} the pattern symbolizes as opposed to \textit{finding} it in the first place -- is indeed only of interest after most of the work is already done.
70
+
71
+\subsection{A-priori based approach}
72
+\label{subsection:apriori}
73
+The a-priori based paradigm for frequent pattern mining is a heuristic that generates a reduced set of patterns through each iteration.\cite{Pei2000-rz} The a-priori principle, on which these approaches are based, states that \textit{any super-pattern of an infrequent pattern cannot be frequent}.\cite{Han2004-qs} Thus, these algorithms first generate a set of all frequent 1-element sequences. From that, they generate new candidate sequences in a step-wise way. For example, if the patterns \textit{A} and \textit{B} are each frequent according to a specific \textit{min\_sup}, then the pattern \textit{AB} might be frequent as well. These generated patterns are then tested and discarded if and only if they don't reach the given \textit{min\_sup} (cf. algorithm \ref{alg:apriori}, where $T$ is the transaction database and $C_k$ is the candidate set for sequence length $k$).
74
+
75
+\begin{algorithm}
76
+	\caption[APriori($T,min\_sup$)]{APriori($T,min\_sup$)\cite{Agrawal1994-ca}}\label{alg:apriori}
77
+	\begin{algorithmic}[1]
78
+		\State {$L_1 \gets$ \{large 1-itemsets\};}
79
+		\State {$k \gets 2$}
80
+		\While {$L_{k-1} \neq \emptyset$}
81
+		\State {$C_k \gets \{ a \cup \{b\} \mid a \in L_{k-1} \land b \not \in a \} - \{ c \mid \{ s \mid s \subseteq c \land |s| = k-1 \} \nsubseteq L_{k-1} \}$}
82
+		\ForEach {$t \in T$}
83
+		\State {$C_t \gets \{ c \mid c \in C_k \land c \subseteq t \}$}
84
+		\ForEach {$c \in C_t$}
85
+		\State {$count[c] \gets count[c] + 1$}
86
+		\EndFor
87
+		\EndFor
88
+		\State {$L_k \gets \{c|c \in C_k \land count[c] \geq min\_sup\}$}
89
+		\State {$k \gets k+1$}
90
+		\EndWhile \\
91
+		\Return {$\bigcup_k L_k$;}
92
+	\end{algorithmic}
93
+\end{algorithm}
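+
+To make the generate-and-test loop above more concrete, the following is a minimal Python sketch of the a-priori idea for itemsets; the list-of-sets transaction format and the helper structure are illustrative assumptions, not the implementation used in this work.
+\begin{verbatim}
+from itertools import combinations
+
+def apriori(transactions, min_sup):
+    """Minimal a-priori sketch: grow frequent itemsets level by level."""
+    transactions = [frozenset(t) for t in transactions]
+    min_count = min_sup * len(transactions)
+    items = {i for t in transactions for i in t}
+    # L1: frequent 1-itemsets
+    level = {frozenset([i]) for i in items
+             if sum(i in t for t in transactions) >= min_count}
+    frequent = set(level)
+    k = 2
+    while level:
+        # join step: build candidates of size k from frequent (k-1)-itemsets
+        candidates = {a | b for a in level for b in level if len(a | b) == k}
+        # prune step: every (k-1)-subset of a candidate must itself be frequent
+        candidates = {c for c in candidates
+                      if all(frozenset(s) in level
+                             for s in combinations(c, k - 1))}
+        # count step: keep only candidates reaching min_sup
+        level = {c for c in candidates
+                 if sum(c <= t for t in transactions) >= min_count}
+        frequent |= level
+        k += 1
+    return frequent
+\end{verbatim}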
94
+
95
+Well-studied a-priori-based algorithms include the already mentioned GSP\cite{Srikant1996-dy}, SPADE\cite{Zaki2001-jy}, or HitSet\cite{Han1999-bj}. As a-priori-based approaches have to search through the input data at least once for every candidate pattern, the best a-priori algorithms achieve runtime efficiencies of $O(n^2)$.
96
+
97
+\subsection{Pattern-growth based approach}
98
+In contrast, the pattern-growth approach adopts a divide-and-conquer principle as follows: \textit{sequence databases are recursively projected into a set of smaller projected databases based on the current sequential pattern(s), and sequential patterns are grown in each projected database by exploring only locally frequent fragments}.\cite{Han2004-qs} The \textit{gSpan}-algorithm used in this work is, like its predecessors \textit{FreeSpan} and \textit{PrefixSpan}, such a pattern-growth based approach.
99
+
100
+\chapter{Theoretical basis}
101
+\label{chapter:theoretical_basis}
102
+The proposed event classification method mines a graph set $\mathcal{D} = (G_0, ..., G_n)$ for substructures of interest, often called ``patterns'' in the literature. To establish a firm theoretical understanding of what these structures are, some definitions from graph theory are in order.
103
+
104
+\section{Graph}
105
+Traditionally, a graph is represented as a set of vertices $V$ and a set of edges $E$, with a mapping $f : E \rightarrow V \times V$ that defines each edge as a tuple of vertices. In directed graphs, this tuple is ordered. In undirected graphs, it is unordered, so that $\forall v_i, v_j: (v_i, v_j) \in E \rightarrow (v_j, v_i) \in E$. In this work, the vertex set $V$ of a graph $G$ may also be denoted $V(G)$. Likewise, the edge set may be denoted $E(G)$. To encode these structures in software, adjacency matrices and edge lists are commonly deployed. The following example adjacency matrix (table \ref{tab:example_adj_mat}) and visual representation (figure \ref{fig:example_graph}) both encode the same undirected graph $G = \left(\{v_1, v_2, v_3, v_4\}, \{(v_1, v_2), (v_2,v_3), (v_3, v_4), (v_4, v_2)\}\right)$.
106
+
107
+\begin{table*}
108
+	\caption{Adjacency matrix for graph $G$}
109
+	\label{tab:example_adj_mat}
110
+	\centering
111
+	\begin{tabular}{l|cccc}
112
+		      & $v_1$ & $v_2$ & $v_3$ & $v_4$ \\ \hline
113
+		$v_1$ &   0   &   1   &   0   &   0    \\
114
+		$v_2$ &   1   &   0   &   1   &   1   \\
115
+		$v_3$ &   0   &   1   &   0   &   1   \\
116
+		$v_4$ &   0   &   1   &   1   &   0
117
+	\end{tabular}
118
+\end{table*}
119
+
120
+\begin{figure}
121
+	\centering
122
+\begin{tikzpicture}[node distance = 2cm]
123
+	\tikzset{VertexStyle/.style = {
124
+			shape=circle,
125
+			draw=black
126
+	}}
127
+	\node[VertexStyle] (1){$v_1$};
128
+	\node[VertexStyle, right of= 1] (2){$v_2$};	
129
+	\node[VertexStyle, right of= 2] (3){$v_3$};
130
+	\node[VertexStyle, below of= 2, right of=2] (4){$v_4$};
131
+	
132
+	\path [-] (1) edge node[above] {} (2);
133
+	\path [-] (2) edge node[above] {} (3);
134
+	\path [-] (3) edge node[left] {} (4);
135
+	\path [-] (2) edge node[left] {} (4);
136
+	\end{tikzpicture}
137
+	\caption[Visual representation of graph $G$]{Visual representation of graph $G$}
138
+	\label{fig:example_graph}
139
+\end{figure}
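+
+As a minimal sketch (not part of the tooling used later on), the same graph $G$ could, for instance, be held in Python as an edge list from which the adjacency matrix of table \ref{tab:example_adj_mat} is derived:
+\begin{verbatim}
+# Edge list of the undirected example graph G
+vertices = [1, 2, 3, 4]
+edges = [(1, 2), (2, 3), (3, 4), (4, 2)]
+
+# Build the adjacency matrix from the edge list
+adj = [[0] * len(vertices) for _ in vertices]
+for u, v in edges:
+    adj[u - 1][v - 1] = 1
+    adj[v - 1][u - 1] = 1  # undirected: matrix is symmetric
+
+# The row for v_2 matches the corresponding row of the adjacency matrix table
+assert adj[1] == [1, 0, 1, 1]
+\end{verbatim}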
140
+\section{Subgraph}
141
+Let $G_s$ and $G$ be two graphs, where $G_s = (V_s, E_s)$ and $V_s \subset V(G), E_s \subset E(G)$. Then, if the following holds:
142
+\[ \forall (v_i, v_j) \in E_s \implies v_i, v_j \in V_s, \]
143
+$G_s$ is said to be a subgraph of $G$. Figure \ref{fig:example_subgraph} illustrates an example subgraph of $G$ with three of $G$'s four vertices and two of its four edges. Note that the graphical representation need not be drawn in the same way to depict a subgraph.
144
+
145
+\begin{figure}[h]
146
+	\centering
147
+	\begin{tikzpicture}[node distance = 2cm]
148
+	\tikzset{VertexStyle/.style = {
149
+			shape=circle,
150
+			draw=black
151
+	}}
152
+	\node[VertexStyle] (2){$v_2$};
153
+	\node[VertexStyle, right of= 2, below of=2] (3){$v_3$};
154
+	\node[VertexStyle, right of=3, above of=3] (4){$v_4$};
155
+	
156
+
157
+	\path [-] (2) edge node[above] {} (3);
158
+	\path [-] (3) edge node[left] {} (4);
159
+	\end{tikzpicture}
160
+	\caption{Example subgraph $G_s$ of $G$}
161
+	\label{fig:example_subgraph}
162
+\end{figure}
163
+
164
+
165
+\section{(Sub-)Graph isomorphism}
166
+Two graphs $G$ and $H$ are said to be isomorphic, written $G \simeq H$, if there exists a bijection $f: V(G) \rightarrow V(H)$ such that any two vertices $u$ and $v$ of $G$ are adjacent in $G$ if and only if $f(u)$ and $f(v)$ are adjacent in $H$. It is currently unknown whether the graph isomorphism problem can be solved in polynomial time or is NP-complete.\cite{Fortin1996-la}
167
+
168
+The subgraph isomorphism problem is the decision problem of whether, given two graphs $G$ and $H$, there exists a subgraph of $G$ that is isomorphic to $H$. The subgraph isomorphism problem is known to be NP-complete.\cite{Cook1971-fj}
169
+
170
+\chapter{Methodology}
171
+Considering time both as an explicit attribute of nodes and edges and as information implicitly encoded in the graph structure, and as such still present in subsequently mined subgraphs, constitutes an approach to exploring log data through graph mining that has not been examined extensively before.
172
+
173
+\section{Disregarded approaches}
174
+Before homing in on analyzing the given data set with \textit{gSpan} and a \textit{Support Vector Machine}, several approaches that yielded results of lesser quality were tried. Even though these approaches didn't prove fruitful in the long run, they helped me understand the data set and its underlying structure better, and as such their basic premises are included here to give a full account of all measures taken.
175
+
176
+\subsection{Shortest path algorithms on a single graph}
177
+\begin{figure}
178
+	\centering
179
+	\noindent\includegraphics[width=\linewidth]{images/fullgraph}
180
+	\caption{Naive single graph encoding all available information}
181
+	\label{fig:fullgraph}
182
+\end{figure}
183
+
184
+As the manufacturing process consists of a circuit of interconnected modules, the first approach of translating the given data into a graph form was to build one single large graph with all information available around this first circuit of module nodes, with the intention of later on mining frequent patterns from this single large graph, using algorithms such as Elseidy et al.'s \textit{GRAMI}\cite{Elseidy2014-fz} or Moussaoui et al.'s \textit{POSGRAMI}\cite{Moussaoui2016-ng}.
185
+
186
+To encode the log error data on top of this framework, the error messages were split into terms (essentially words) and each term made a node. Edges were drawn between all term-nodes in a given message, as well as between all term-nodes and the module-node they occurred in, as well as between all term-nodes and the production-unit-ID-node. This first naive approach of visualizing the available data produced a graph consisting of 523 nodes connected by 2,182 edges (illustrated in figure \ref{fig:fullgraph}). Edges were weighted simply by a count of how often they appeared throughout the whole data set, with some edges only appearing once and a maximum weight of over 10,000 appearances.
187
+
188
+This first graph did not, to a first approximation, retain its expected circuit-like appearance, instead clustering heavily around a few modules and production unit IDs that produced the most errors, meaning there were parts in the system vastly more error prone than others.
189
+
190
+In a second step, Dijkstra's shortest-path algorithm was used to find all paths between nodes that had a full path length of less than a specific constant, afterwards ordered by path length. For example, a short 4-edge path could connect two modules and the part ID via two error terms, indicating some kind of correlation, much like a sequential pattern analysis could find.
191
+
192
+For Dijkstra to work, the naive edge weight for edge $i$ ($w_{i_{naive}}$) had to be transformed from simple counts to a normalized form that would retain comparison operations between edges, but invert path lengths (as the Dijkstra algorithm is used, as its name suggests, to find the \emph{shortest paths}). This was achieved by the following transformation being calculated after graph generation, with~$w_{all}~=~\sum_{j=0}^{n} w_{j_{naive}}$:
193
+\[w_{i_{normalized}} = -\ln \left( \frac{w_{i_{naive}}}{w_{all}} \right) \]
194
+
195
+This weight was later penalized further by adding time constraints, so that
196
+\[w_{i_{penalized}} = w_{i_{normalized}} + P(i)\]
197
+
198
+where
199
+\[P(i) = \begin{cases}
200
+	\log_{\tilde{\Delta t}} \frac{\Delta t}{C(\Delta t) \cdot \tilde{\Delta t}}, & \text{if }C(\Delta t), \tilde{\Delta t} > 0 \\
201
+	0, & \text{otherwise}
202
+\end{cases}\]
203
+
204
+Here, $\Delta t$ is the amount of time between the rows being connected by the edge, $C(\Delta t)$ is the number of connections between these nodes, and $\tilde{\Delta t}$ is the average time differential between rows overall. This ensured penalization for long times between events (indicating that the events weren't correlated), while similarly rewarding events that happened more often in conjunction.
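+
+As a short illustration, the two transformations can be written down in a few lines of Python; the dictionary-based edge representation and the variable names are assumptions made purely for this sketch.
+\begin{verbatim}
+import math
+
+def normalized_weights(naive_weights):
+    """Turn naive co-occurrence counts into Dijkstra-friendly weights."""
+    w_all = sum(naive_weights.values())
+    return {edge: -math.log(w / w_all) for edge, w in naive_weights.items()}
+
+def penalty(delta_t, count, mean_delta_t):
+    """P(i): long gaps between events raise the weight, frequent
+    co-occurrence lowers it (0 if the guard condition is not met)."""
+    if count > 0 and mean_delta_t > 0:
+        return math.log(delta_t / (count * mean_delta_t), mean_delta_t)
+    return 0.0
+\end{verbatim}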
205
+
206
+\begin{figure}
207
+	\centering
208
+	\noindent\includegraphics[width=\linewidth]{images/example_pattern}
209
+	\caption{Large example pattern}
210
+	\label{fig:example_patterns}
211
+\end{figure}
212
+
213
+\begin{figure}
214
+	\centering
215
+	\noindent\includegraphics[width=\linewidth]{images/small_pattern}
216
+	\caption{Smaller example pattern}
217
+	\label{fig:small_pattern}
218
+\end{figure}
219
+
220
+\subsection{Digraph}
221
+Using a directed graph was briefly considered, but did, in some first tests, not lead to noticeably different patterns from the undirected graph. This was almost expected, as graph connections were highly predictable because the graphs were constructed similarly to dependency graphs.
222
+
223
+If, for example, in one slice, specific terms had a directed edge \textit{towards} a specific module, then this basic structure would obviously be repeated in other slices as well, by virtue of a construction that always pointed terms towards modules. The simple connectedness attribute that is an edge in an undirected graph already served this purpose implicitly when taking into account this background knowledge used during graph construction.
224
+
225
+\subsection{Natural language processing}
226
+The idea of combining both techniques of graph mining and a natural language processing unit to make sense of the actual contents of the error logs was briefly entertained as well. Good, easily available NLP software for languages other than English is still quite hard to come by, however. In addition, the analyzed error logs were often very short and technical, and thus didn't lend themselves to an actual content-based analysis. Even to native speakers, many of the messages would have been quite cryptic, and so it was decided that the processing power that would have been needed for an NLP-based approach would be better used elsewhere.
227
+
228
+\section{Data set splicing}
229
+\label{section:data_set_splicing}
230
+The investigated data set consisted of about 57,000 messages logged over the course of five consecutive days in October 2016. Available columns were a time stamp (precision: 1s), an unstructured log message in German, the module ID where the message originated and the part ID of the produced item in that run. Messages sometimes included additional partly structured data, like a more detailed report of the location where the error occurred included in the German log message. See table \ref{table:dummy_messages} for an example of the data structure and a part of the synthetic data set used for the testing setup in section \ref{section:synthetic_data_results}.
231
+
232
+\begin{table}[]
233
+	\centering
234
+	\caption{Synthetic data set (excerpt)}
235
+	\label{table:dummy_messages}
236
+	\footnotesize 
237
+	\begin{tabular}{l|l|l|l}
238
+		time stamp & log message & module id & part id \\ \hline
239
+		2017-04-05 11:01:05 & Laser "uberhitzt            & Module 1 & 88495775TEST \\
240
+		2017-04-05 11:01:05 & Laser "uberhitzt            & Module 1 & 88495776TEST \\
241
+		2017-04-05 11:01:06 & Teil verkantet             & Module 2 & 88495776TEST \\
242
+		2017-04-05 11:01:06 & Laser "uberhitzt            & Module 1 & 88495776TEST \\
243
+		2017-04-05 11:01:10 & Laser "uberhitzt            & Module 1 & 88495776TEST \\
244
+		2017-04-05 11:01:12 & Auffangbeh"alter leeren     & Module 2 & 88495775TEST \\
245
+		2017-04-05 11:01:17 & Unbekannter Ausnahmefehler & Module 0 & 88495775TEST \\
246
+		2017-04-05 11:01:17 & Auffangbeh"alter leeren     & Module 2 & 88495775TEST \\
247
+		2017-04-05 11:01:19 & Unbekannter Ausnahmefehler & Module 0 & 88495775TEST \\
248
+		2017-04-05 11:05:22 & Laser "uberhitzt            & Module 1 & 88495775TEST \\
249
+		\multicolumn{1}{c}{\vdots} & \multicolumn{1}{c}{\vdots} & \multicolumn{1}{c}{\vdots} & \multicolumn{1}{c}{\vdots}
250
+	\end{tabular}
251
+\end{table}
252
+
253
+The data had to be cleaned up slightly prior to a first cursory visual analysis. At first glance, a large share of the total message count consisted of (without expert knowledge) seemingly meaningless general error messages containing only an error code and no further explanation. These specific messages existed in 4 slightly different formats of about 30 messages with the same time stamp each, with the error code incremented by one with each message. Each instance of these 30-message bursts was replaced with a simple ``general error'' message with the same time stamp. After this preprocessing step, the number of messages to be considered had roughly halved.
254
+
255
+Further, if the module ID and manufacturing unit ID were included in the message in one of several standardized ways, they were extracted and given their own column in the data set, to prevent random integers cropping up in the messages from being mistaken for, e.g., a module ID.
256
+
257
+To generate the graph set to be mined, a week's log data was spliced along 5-minute time frames, producing graphs of a size like the one exemplified in figure \ref{fig:partial_graph}. Two different modules throwing errors in this specific time window can be easily distinguished. A 3-minute window was briefly considered, but the resulting graphs were, almost surprisingly, much smaller overall, so that the mined patterns weren't open to any meaningful interpretation and were fewer in number as well. Windows smaller than 5 minutes in general mostly led to many more small graphs, so that no long pattern could conceivably ever reach any useful \textit{min\_sup} threshold.
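+
+A minimal sketch of this splicing step, assuming the cleaned log is held in a pandas DataFrame with a \texttt{timestamp} column (the column name and the 5-minute frequency string are illustrative assumptions):
+\begin{verbatim}
+import pandas as pd
+
+def splice_into_windows(log_df, freq="5min"):
+    """Group log rows into fixed 5-minute windows, one future graph each."""
+    log_df = log_df.set_index(pd.to_datetime(log_df["timestamp"]))
+    return [window for _, window in log_df.groupby(pd.Grouper(freq=freq))
+            if not window.empty]
+\end{verbatim}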
258
+
259
+\begin{figure}
260
+	\centering
261
+	\noindent\includegraphics[width=\linewidth]{images/partial_graph}
262
+	\caption{Graph of a 5-minute log data window}
263
+	\label{fig:partial_graph}
264
+\end{figure}
265
+
266
+\section{Basic graph construction}
267
+Extracting a basic graph structure from the input is both one of the least computationally intensive steps in the proposed methodology and the most important one. The analyzed data sets don't lend themselves to a natural scheme; background knowledge about the facility had to play an important part in determining the basic structure of the graphs.
268
+
269
+As previously mentioned, the constructed graphs heavily relied on known dependencies between error messages, modules and part IDs. Figure \ref{fig:example_slice} illustrates the common connections between the full error message \textit{Z}, its terms \textit{T}, the error-logging module \textit{M} and the specific part ID \textit{S}. Error messages could be connected via common words between them, e.g. through a further narrowed-down standardized localization term, while modules and part IDs would be connected to all error messages, and their terms, produced with their involvement.
270
+
271
+\begin{figure}
272
+	\centering
273
+	\begin{tikzpicture}[node distance = 2cm]
274
+	\tikzset{VertexStyle/.style = {
275
+			shape=circle,
276
+			draw=black
277
+	}}
278
+	\node[VertexStyle] (1) {Z};
279
+	\node[VertexStyle, left of=1, above of=1] (2) {T};
280
+	\node[VertexStyle, right of=1, above of=1] (3) {T};
281
+	\node[VertexStyle, left of=1, below of=1] (4) {T};
282
+	\node[VertexStyle, right of=1, below of=1] (5) {T};
283
+	\node[VertexStyle, left of=2, below of=2] (6) {M};
284
+	\node[VertexStyle, right of=5, above of=5] (7) {S};
285
+	\node[right of=7, above of=7] (8) {$\dots$};
286
+	\node[left of=6, below of=6] (9) {$\dots$};
287
+	
288
+	\path [-] (1) edge node[left] {} (2);
289
+	\path [-] (1) edge node[left] {} (3);
290
+	\path [-] (1) edge node[left] {} (4);
291
+	\path [-] (1) edge node[left] {} (5);
292
+	\path [-] (3) edge node[left] {} (2);
293
+	\path [-] (4) edge node[left] {} (2);
294
+	\path [-] (5) edge node[left] {} (3);
295
+	\path [-] (5) edge node[left] {} (4);
296
+	\path [-] (2) edge node[left] {} (6);
297
+	\path [-] (3) edge node[left] {} (6);
298
+	\path [-] (4) edge node[left] {} (6);
299
+	\path [-] (5) edge node[left] {} (6);
300
+	\path [-] (2) edge node[left] {} (7);
301
+	\path [-] (3) edge node[left] {} (7);
302
+	\path [-] (4) edge node[left] {} (7);
303
+	\path [-] (5) edge node[left] {} (7);
304
+	\path [-] (7) edge node[left] {} (8);
305
+	\path [-] (6) edge node[left] {} (9);
306
+	\end{tikzpicture}
307
+	\caption[Example slice]{Example slice}
308
+	\label{fig:example_slice}
309
+\end{figure}
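+
+To make the construction shown in figure \ref{fig:example_slice} concrete, the following sketch builds one such slice graph with the networkx library; the column names and the naive whitespace tokenization are assumptions for illustration and not necessarily identical to the code written for this thesis.
+\begin{verbatim}
+import networkx as nx
+
+def build_slice_graph(window_df):
+    """Connect terms (T) with each other, their message (Z), and the
+    logging module (M) and part ID (S) of each row in the time window."""
+    G = nx.Graph()
+    for _, row in window_df.iterrows():
+        message, module, part = row["message"], row["module_id"], row["part_id"]
+        terms = message.split()  # naive whitespace tokenization
+        for kind, node in [("Z", message), ("M", module), ("S", part)]:
+            G.add_node(node, kind=kind)
+        for t in terms:
+            G.add_node(t, kind="T")
+            G.add_edge(t, message)
+            G.add_edge(t, module)
+            G.add_edge(t, part)
+        for i, t1 in enumerate(terms):       # connect terms of the
+            for t2 in terms[i + 1:]:         # same message with each other
+                G.add_edge(t1, t2)
+    return G
+\end{verbatim}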
310
+
311
+\subsection{Using background knowledge during model construction}
312
+In the case of production facilities, the basic structure of the facility itself, even more so in a modularized system like the one at hand, has to be integrated into the basic graph scheme. Considering inherent parallelisms in the system (like two modules working in parallel) already places a few constraints on the resulting input graphs. Constraints were mostly input manually into the system prior to graph construction.
313
+
314
+\section{Features of interest}
315
+As mentioned before, the production facility doesn't run at peak performance most of the time. As such, of interest to this analysis was mostly whether specific events or sequences of events would be able to predict a decrease in the OEE figure. A sequence analysis performed by my advisor some time before this thesis had already yielded some preliminary results in this direction, which were considered known problems by the experts. Stumbling upon a pattern which would be considered a novel mechanical problem to solve would be the prime result of this work.
316
+
317
+\section{Modified gSpan}
318
+\label{section:gspan}
319
+\textit{gSpan} was first introduced by Yan and Han in 2002.\cite{Yan2002-sj}\cite{Yan2002-hg} \textit{gSpan} leverages depth-first search (DFS) to map graphs to minimum DFS codes, which are a canonical lexicographic graph labeling method. As all isomorphic graphs have the same canonical label, once computation of the labels is completed, it's trivially easy to solve the isomorphism question for any two graphs by comparing their canonical labels. If those labels are available in a lexicographic format, their comparison in itself is also trivially achievable by simple string comparison. The modification of \textit{gSpan} in this work is based on adding a hash function to make the lexicographic comparison faster.
320
+
321
+Introducing a hashing algorithm obviously introduces the risk of collisions between hashes, rendering the formerly unambiguous mapping between canonical labels and graphs no longer one-to-one. In the case of structured or semi-structured operator-controlled inputs, however, the theoretical possibility of collisions because of the hashing shouldn't be a huge concern.
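+
+A minimal sketch of this hashing idea, assuming a DFS code is available as a list of 5-tuples; the use of BLAKE2 via Python's \texttt{hashlib} and the serialization format are illustrative choices.
+\begin{verbatim}
+import hashlib
+
+def dfs_code_hash(dfs_code):
+    """Map a DFS code (a list of 5-tuples) to a short fixed-length digest,
+    so that equality checks become cheap string comparisons."""
+    canonical = ";".join(",".join(map(str, t)) for t in dfs_code)
+    return hashlib.blake2b(canonical.encode("utf-8"),
+                           digest_size=16).hexdigest()
+
+code_g = [(0, 2, "U", "d", "Y"), (1, 2, "X", "a", "Y"),
+          (0, 3, "U", "c", "Z"), (2, 3, "Y", "b", "Z")]
+# identical DFS codes always map to the same digest
+assert dfs_code_hash(code_g) == dfs_code_hash(list(code_g))
+\end{verbatim}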
322
+
323
+
324
+\subsection{DFS Lexicographic Order}
325
+To demonstrate the construction of a minimum DFS code, we're again using the example graph from chapter \ref{chapter:theoretical_basis}, this time with additional labels for nodes and edges as visualized in figure \ref{fig:example_graph_dfs}.
326
+\begin{figure}
327
+	\centering
328
+	\begin{tikzpicture}[node distance = 2cm]
329
+	\tikzset{VertexStyle/.style = {
330
+		shape=circle,
331
+		draw=black
332
+	}}
333
+	\node[VertexStyle, label={[label distance=-.2cm]45:\small $v_1$}] (1){X};
334
+	\node[VertexStyle, right of= 1, label={[label distance=-.2cm]45:\small $v_2$}] (2){Y};	
335
+	\node[VertexStyle, right of= 2, label={[label distance=-.2cm]45:\small $v_3$}] (3){Z};
336
+	\node[VertexStyle, below of= 2, right of=2, label={[label distance=-.2cm]45:\small $v_4$}] (4){U};
337
+	
338
+	\path [-] (1) edge node[above] {a} (2);
339
+	\path [-] (2) edge node[above] {b} (3);
340
+	\path [-] (3) edge node[left] {c} (4);
341
+	\path [-] (2) edge node[left] {d} (4);
342
+	\end{tikzpicture}
343
+	\caption[Graph $G$ from chapter \ref{chapter:theoretical_basis} with labels]{Graph $G$ from chapter \ref{chapter:theoretical_basis} with labels}
344
+	\label{fig:example_graph_dfs}
345
+\end{figure}
346
+
347
+Mapping this graph to a minimum DFS code using algorithm \ref{alg:mindfscode} yields the minimum DFS code in table \ref{table:dummy_min_dfs_codes}.
348
+\begin{table}[h]
349
+	\centering
350
+	\caption{Minimum DFS code of graph $G$}
351
+	\label{table:dummy_min_dfs_codes}
352
+	\begin{tabular}{l|l}
353
+		edge no. & DFS code          \\ \hline
354
+		0        & $(0,2,U,d,Y)$     \\
355
+		1        & $(1, 2, X, a, Y)$ \\
356
+		2        & $(0, 3, U, c,Z)$  \\
357
+		3        & $(2, 3, Y, b, Z)$
358
+	\end{tabular}
359
+\end{table}
360
+
361
+\begin{algorithm}
362
+	\caption{MinDFSCode($G$)}\label{alg:mindfscode}
363
+	\begin{algorithmic}[1]
364
+		\State {initiate $S \gets \emptyset$}
365
+		\ForEach {vertex $v \in V(G)$}
366
+		\State {perform a depth-first search with $v$ as a starting point;}
367
+		\State {transform the resulting DFS tree $t$ into a DFS code tuple;}
368
+		\EndFor
369
+		\State {sort the resulting DFS codes according to the DFS lexicographic order and choose the smallest one as the canonical label}
370
+	\end{algorithmic}
371
+\end{algorithm}
372
+The DFS lexicographic order is a linear order defined by the less or equal function in algorithm \ref{alg:dfs_lexicographic_order}. For the neighborhood restrictions and the comparison function between two DFS code tuples $a = (i_a, j_a, l_{i_a}, l_{(i_a, j_a)}, l_{j_a})$ and $b = (i_b, j_b, l_{i_b}, l_{(i_b, j_b)}, l_{j_b})$, please see algorithm \ref{alg:dfs_lexicographic_order_tuples}. In both algorithms, the following definitions apply: $\alpha = (a_0, a_1, ..., a_m)$ and $\beta = (b_0, b_1, ..., b_n)$, where each $a_t, b_t$ is a DFS code tuple of the form $x_t = (i_x, j_x, l_{i_x}, l_{(i_x, j_x)}, l_{j_x})$. $i_x, j_x$ are vertices, $l_{i_x}, l_{j_x}$ are their labels, and $l_{(i_x, j_x)}$ is the edge label.
373
+
374
+\begin{algorithm}
375
+	\caption{DFSLexicographicLE($\alpha, \beta$)}\label{alg:dfs_lexicographic_order}
376
+	\begin{algorithmic}[1]
377
+		\If {$n \geq m$ and $a_m = b_m$}
378
+			\State\Return {True, ie $\alpha \leq \beta$}
379
+		\Else
380
+			\State {$a_{\text{forward}} \gets \text{Bool}(j_a > i_a)$}
381
+			\State {$b_{\text{forward}} \gets \text{Bool}(j_b > i_b)$}
382
+			\State {$a_{\text{backward}} = \neg a_{\text{forward}}$}
383
+			\State {$b_{\text{backward}} = \neg b_{\text{forward}}$}
384
+			\If{$a_{\text{forward}} \land b_{\text{forward}}$}
385
+				\State\Return {True, ie $\alpha \leq \beta$}
386
+			\EndIf
387
+			\If{$a_{\text{backward}} \land b_{\text{backward}} \land j_a < j_b$}
388
+				\State\Return {True, ie $\alpha \leq \beta$}
389
+			\EndIf
390
+			\If{$a_{\text{backward}} \land b_{\text{backward}} \land j_a = j_b \land l_{(i_a, j_a)} < l_{(i_b, j_b)}$}
391
+				\State\Return {True, ie $\alpha \leq \beta$}
392
+			\EndIf
393
+			\If{$a_{\text{forward}} \land b_{\text{forward}} \land i_b < i_a$}
394
+				\State\Return {True, ie $\alpha \leq \beta$}
395
+			\EndIf
396
+			\If{$a_{\text{forward}} \land b_{\text{forward}} \land i_b = i_a \land l_{i_a} < l_{i_b}$}
397
+				\State\Return {True, ie $\alpha \leq \beta$}
398
+			\EndIf
399
+			\If{$a_{\text{forward}} \land b_{\text{forward}} \land i_b = i_a \land l_{i_a} = l_{i_b} \land l_{(i_a, j_a)} < l_{(i_b, j_b)}$}
400
+				\State\Return {True, ie $\alpha \leq \beta$}
401
+			\EndIf
402
+			\If{$a_{\text{forward}} \land b_{\text{forward}} \land i_b = i_a \land l_{i_a} = l_{i_b} \land l_{(i_a, j_a)} = l_{(i_b, j_b)} \land l_{j_a} < l_{j_b}$}
403
+				\State\Return {True, ie $\alpha \leq \beta$}
404
+			\EndIf
405
+			\State\Return {False, ie $\alpha > \beta$}
406
+		\EndIf
407
+	\end{algorithmic}
408
+\end{algorithm}
409
+
410
+\begin{algorithm}
411
+	\caption{DFSTuplesLexicographicLE($a, b$)}\label{alg:dfs_lexicographic_order_tuples}
412
+	\begin{algorithmic}[1]
413
+		\State {$a_{\text{forward}} \gets \text{Bool}(j_a > i_a)$}
414
+		\State {$b_{\text{forward}} \gets \text{Bool}(j_b > i_b)$}
415
+		\State {$a_{\text{backward}} = \neg a_{\text{forward}}$}
416
+		\State {$b_{\text{backward}} = \neg b_{\text{forward}}$}
417
+		\If{$a_{\text{forward}} \land b_{\text{forward}} \land a_j < b_j$}
418
+			\State\Return {True, ie $a \leq b$}
419
+		\EndIf
420
+		\If{$a_{\text{backward}} \land b_{\text{backward}} \land (a_i < b_i \lor (a_i = b_i \land a_j < b_j))$}
421
+			\State\Return {True, ie $a \leq b$}
422
+		\EndIf
423
+		\If{$a_{\text{backward}} \land b_{\text{forward}} \land a_i < b_j$}
424
+			\State\Return {True, ie $a \leq b$}
425
+		\EndIf
426
+		\If{$a_{\text{forward}} \land b_{\text{backward}} \land b_j \leq a_i$}
427
+			\State\Return {True, ie $a \leq b$}
428
+		\EndIf
429
+		\If{$a_{\text{backward}}$}
430
+			\If{$b_{\text{forward}} \land b_i \leq a_i \land b_j = a_i + 1$}
431
+				\State\Return {True, ie $a \leq b$}
432
+			\EndIf
433
+			\If{$b_{\text{backward}} \land b_i = a_i \land a_j < b_j$}
434
+				\State\Return {True, ie $a \leq b$}
435
+			\EndIf
436
+		\EndIf
437
+		\If{$a_{\text{forward}}$}
438
+			\If{$b_{\text{forward}} \land b_i \leq a_j \land b_j = a_j + 1$}
439
+				\State\Return {True, ie $a \leq b$}
440
+			\EndIf
441
+			\If{$b_{\text{backward}} \land b_i = a_j \land b_j < a_i$}
442
+				\State\Return {True, ie $a \leq b$}
443
+			\EndIf
444
+		\EndIf
445
+		\State\Return {False, ie $a > b$}
446
+		
447
+	\end{algorithmic}
448
+\end{algorithm}
449
+
450
+With these comparison algorithms in place, a \textit{DFS Code Tree} can be constructed. In a \textit{DFS Code Tree}, each node represents one graph via its DFS code. Obviously, in such a tree, the DFS code for a graph can turn up more than once, depending on node addition order. Thus, the first code that turns up on a pre-order depth-first search of the \textit{DFS Code Tree} is what we previously called the minimum DFS code.
451
+
452
+This results in the more formal definition given by \cite{Yan2002-hg}:
453
+\begin{quotation}
454
+	Given a graph $G$, $Z(G) = \{code(G, T) | \forall T, T \text{ is a DFS code}\}$, based on DFS lexicographic order, the minimum one, $\min(Z(G))$, is called \textbf{Minimum DFS Code} of $G$. It is also the canonical label of $G$.
455
+\end{quotation}
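+
+As a simplified illustration, once a less-or-equal comparison such as algorithm \ref{alg:dfs_lexicographic_order} is available, choosing the canonical label reduces to taking the minimum over all DFS codes generated for a graph; the function \texttt{dfs\_lexicographic\_le} below is assumed to implement that comparison.
+\begin{verbatim}
+def minimum_dfs_code(candidate_codes, dfs_lexicographic_le):
+    """Pick the canonical label: the smallest of the candidate DFS codes
+    under the DFS lexicographic order."""
+    best = candidate_codes[0]
+    for code in candidate_codes[1:]:
+        if not dfs_lexicographic_le(best, code):  # best > code
+            best = code
+    return best
+\end{verbatim}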
456
+
457
+\subsection{Graphset projection and subgraph mining}
458
+The pattern growth approach now becomes clearer when we're trying to construct a new pattern from an already found one: to construct a valid DFS code for the new pattern, the new edge cannot be added at an arbitrary position, but can only be added to vertices on the ``rightmost path.'' This is further limited, as only forward edges can grow from all vertices on the rightmost path, whereas backward edges can only be grown from the rightmost vertex.
459
+
460
+With these definitions in place, the \textit{gSpan} algorithm works as follows (see algorithm \ref{alg:graph_set_projection} for the pseudocode):
461
+
462
+\begin{algorithm}
463
+	\caption[GraphSet\_Projection($\mathcal{D,S}$)]{GraphSet\_Projection($\mathcal{D,S}$)\cite{Yan2002-hg}}
464
+	\label{alg:graph_set_projection}
465
+	\begin{algorithmic}[1]
466
+		\State sort labels of the vertices and edges in $\mathcal D$ by their frequency;
467
+		\State remove infrequent vertices and edges;
468
+		\State relabel the remaining vertices and edges in descending frequency;
469
+		\State $\mathcal S^1 \gets \text{all frequent 1-edge graphs in } \mathcal D$;
470
+		\State sort $\mathcal S^1$ in DFS lexicographic order;
471
+		\State $\mathcal S \gets \mathcal S^1$;
472
+		\ForEach {$\text{edge }e \in \mathcal S^1$}
473
+		\State $\text{initialize } s \text{with } e, \text{set } s.GS = \{g | \forall g \in \mathcal D, e \in E(g)\}$;
474
+		\State Subgraph\_Mining($\mathcal{D, S}, s$);
475
+		\State $\mathcal D \gets \mathcal D - e$;
476
+		\If {$|\mathcal D| < \textit{min\_sup}$}
477
+		\State \textbf{break};
478
+		\EndIf
479
+		\EndFor
480
+	\end{algorithmic}
481
+\end{algorithm}
482
+
483
+In a first step, infrequent single nodes and edges are removed from the search space, as there can be no longer patterns with infrequent substructures in them. The frequent one-edge subgraphs are stored in $\mathcal S^1$ and will be used as the seeds from which longer patterns are grown by calling algorithm \ref{alg:subgraph_mining} on all such one-edge patterns. Along the way, the graph set $\mathcal{D}$ is shrunk with each iteration, as previously searched patterns cannot turn up again later on. After finding all one-edge patterns and all their descendants, the algorithm terminates. For a definition of Enumerate(), see \cite{Yan2002-hg}.
484
+
485
+\begin{algorithm}
486
+	\caption[Subgraph\_Mining($\mathcal{D, S}, s$)]{Subgraph\_Mining($\mathcal{D, S}, s$)\cite{Yan2002-hg}}
487
+	\label{alg:subgraph_mining}
488
+	\begin{algorithmic}[1]
489
+		\If {$\textit{s} \neq \textit{min}(s)$}
490
+		\State \textbf{return};
491
+		\EndIf
492
+		\State $\mathcal S \gets \mathcal S \cup \{s\}$
493
+		\State generate all \textit{s'} potential children with one edge growth;
494
+		\State Enumerate(\textit{s});
495
+		\ForEach {$c, c \text{ is } s' \text{ child}$}
496
+		\If {$\textit{support}(c) \geq \textit{min\_sup}$}
497
+		\State $s \gets c;$
498
+		\State Subgraph\_Mining($\mathcal{D, S}, s$);
499
+		\EndIf
500
+		\EndFor
501
+	\end{algorithmic}
502
+\end{algorithm}
503
+
504
+\section{Support Vector Machine}
505
+An SVM\cite{Cortes1995-ix} is a supervised machine learning model that can classify a data set into two distinct groups by constructing a hyperplane between the classes' feature vectors that maximizes the distance between the hyperplane and the nearest data points of either class (the functional margin). Figure \ref{fig:hyperplane} illustrates this for a simple, two-dimensional example with two intuitively distinct classes. The samples on the margin are called the support vectors. The SVM module from the Python package scikit-learn\cite{Pedregosa2011-ld} was used.
506
+\begin{figure}
507
+	\centering
508
+	\noindent\includegraphics[width=\linewidth]{images/sphx_glr_plot_separating_hyperplane_0011}
509
+	\caption[Functional margin between two classes of data points]{Functional margin between two classes of data points\cite{noauthor_undated-io}}
510
+	\label{fig:hyperplane}
511
+\end{figure}
512
+
513
+The original problem formulation for support vector classification is as follows:\cite{Chang2011-wa}
514
+
515
+Let $\boldsymbol{x}_i \in \mathbb{R}^p$ with $i=1,...,n$ and let $\boldsymbol{y} \in \mathbb{R}^n$ be an indicator vector, such that $\boldsymbol{y}_i \in \{1, -1\}$. Then SVC solves the following primal optimization problem:
516
+\[\min_ {w, b, \zeta} \frac{1}{2} w^T w + C \sum_{i=1}^{n} \zeta_i \]
517
+subject to
518
+\[ y_i (w^T \phi (x_i) + b) \geq 1 - \zeta_i,
519
+\zeta_i \geq 0, i=1, ..., n\]
520
+with its dual being
521
+\[\min_{\alpha} \frac{1}{2} \alpha^T Q \alpha - e^T \alpha\]
522
+subject to
523
+\[y^T \alpha = 0,
524
+0 \leq \alpha_i \leq C, i=1, ..., n\]
525
+where $\phi(\boldsymbol{x}_i)$ maps $\boldsymbol{x}_i$ into a higher-dimensional space, $e$ is the vector of all ones, $Q$ is an $n \times n$ positive semidefinite matrix with $Q_{ij} = y_i y_j K(x_i, x_j)$, with $K(x_i, x_j) = \phi(x_i)^T\phi(x_j)$ being the kernel, and $C > 0$ is the regularization parameter (upper bound).
526
+The decision function is given by:
527
+\[\operatorname{sgn}\left(\sum_{i=1}^n y_i \alpha_i K(x_i, x) + \rho\right).\]
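+
+In practice, this boils down to a few calls to scikit-learn. The sketch below assumes one row per time window and one binary column per mined pattern as the feature layout; the concrete value of $C$, the kernel choice and the toy data are illustrative assumptions.
+\begin{verbatim}
+import numpy as np
+from sklearn.svm import SVC
+
+# X: one row per time window, one column per mined pattern (1 = present)
+# y: 1 if the window contains an OEE drop, -1 otherwise
+X = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 0], [0, 1, 1]])
+y = np.array([1, -1, 1, -1])
+
+clf = SVC(C=1.0, kernel="rbf")  # parameter values chosen for illustration
+clf.fit(X, y)
+print(clf.predict([[1, 0, 0]]))
+\end{verbatim}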
528
+
529
+\section{OEE anomaly detection}
530
+Site performance is measured through OEE (Overall Equipment Effectiveness) scoring. OEE calculation yields a scoring between 0 and 1 according to the following formula:
531
+\[OEE = \frac{POK \cdot CT}{OT},\]
532
+where \textit{POK} is the number of \textbf{p}arts that came out of quality control \textbf{OK}, \textit{CT} is the \textbf{c}ycle \textbf{t}ime in seconds per part and \textit{OT} is the \textbf{o}perational \textbf{t}ime of the assembly line in seconds. All values are reset during shift changes, resulting in a short period of 0\% OEE before the first part of a new shift is produced. As an example, if the line ran for 3600 seconds, needed 10 seconds to produce a part and produced 300, the resulting OEE score would be
533
+\[OEE = \frac{300 \cdot 10}{3600}\approx .83. \]
534
+This result would indicate an assembly line running with about 83\% effectiveness. It should've produced 60 parts more in the given time, and, thus, was held up for some reason or another about 17\% of the time.
535
+
536
+OEE scores were available as a time series for the same time frame as the factory data set. As OEE scores were calculated every second, the resulting data set was considerably larger than the error logs, consisting of more than 800,000 rows of 26 columns, most of which weren't used. Anomaly detection consisted mainly of identifying more or less sudden drops in OEE scoring, indicating times when no parts were produced. The first few anomaly detection systems proved very capable of detecting shift changes and not much else, while later iterations did indeed pick up on most of the intuitively obvious drops.
537
+
538
+Algorithm \ref{alg:oeedetectanomalies} provides an efficient anomaly detection algorithm with $O(n)$ complexity, with $S$ being a set of slope indicators, $\tilde{S}$ the mean slope indication and $c$ a manually set parameter for how many standard deviations from the mean slope an anomaly should be assumed.
539
+
540
+\begin{algorithm}
541
+	\caption{OEEDetectAnomalies(OEE\_data)}\label{alg:oeedetectanomalies}
542
+	\begin{algorithmic}[1]
543
+		\State {$S \gets \emptyset$;}
544
+		\State {$R \gets \emptyset$;}
545
+		\ForEach {5 minute slice $\boldsymbol{s}$;}
546
+		\If {$\min_{OEE} \boldsymbol{s} \neq 0$;}
547
+		\State {$x \gets \frac{\max_{t} \boldsymbol{s}}{\min_{t} \boldsymbol{s}} -1 $;}
548
+		\State {$S \gets S \cup x$;}
549
+		\Else
550
+		\State {$S \gets S \cup 0$;}
551
+		\EndIf
552
+		\EndFor
553
+		\State {$l \gets \tilde{S} - c \cdot SD(S)$;}
554
+		\ForEach {$\hat{\boldsymbol{s}} = (x, y, z)$ in $S, z<l$;}
555
+		\State {$R \gets R \cup \hat{\boldsymbol{s}}$;}
556
+		\EndFor \\
557
+		\Return {$R$;}
558
+	\end{algorithmic}
559
+\end{algorithm}
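+
+The following is a compact pandas sketch of the same idea, assuming the OEE scores are available as a pandas Series indexed by timestamp; reading $\max_t$ and $\min_t$ as the OEE values at the end and at the start of each 5-minute slice is an interpretation of the pseudocode above, not the exact implementation.
+\begin{verbatim}
+import pandas as pd
+
+def detect_oee_anomalies(oee, c=0.1):
+    """Flag 5-minute slices whose relative OEE change falls more than
+    c standard deviations below the mean slope indicator."""
+    slopes = {}
+    for start, s in oee.resample("5min"):
+        if len(s) > 0 and s.min() != 0:
+            slopes[start] = s.iloc[-1] / s.iloc[0] - 1  # change over slice
+        else:
+            slopes[start] = 0.0
+    slopes = pd.Series(slopes)
+    limit = slopes.mean() - c * slopes.std()
+    return slopes[slopes < limit]  # windows flagged as drops
+\end{verbatim}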
560
+\chapter{Experiments and performance study}
561
+\section{Test setup}
562
+All tests were performed on a 2015 Lenovo Thinkpad T450s with 12GB of RAM and an Intel Core i7-5600U clocked at 2.6 GHz, running NixOS 17.09 with Python 3.6.4 built with GCC 6.4.0.
563
+\section{Results for a synthetic data set}
564
+\label{section:synthetic_data_results}
565
+The synthetic data set was generated by first simulating a random walk of OEE values from 11 am until about 9 pm, mimicking about one shift (figure \ref{fig:dummyrandomwalk}). Afterwards, matching error logs were created, with some messages more likely to turn up at times when the generated random walk resulted in an OEE drop as recognized by the OEE anomaly detection.
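+
+The random walk itself is straightforward to reproduce with numpy; the step size, the starting value and the clipping to the valid OEE range are assumptions of this sketch rather than the exact parameters used.
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.RandomState(42)
+n_seconds = 10 * 3600  # roughly one shift, 11 am to 9 pm
+steps = rng.normal(loc=0.0, scale=0.001, size=n_seconds)
+oee = np.clip(0.8 + np.cumsum(steps), 0.0, 1.0)  # keep OEE within [0, 1]
+\end{verbatim}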
566
+
567
+\begin{figure}
568
+	\centering
569
+	\includegraphics[width=1\linewidth]{images/dummy_random_walk}
570
+	\caption{Synthetic OEE values}
571
+	\label{fig:dummyrandomwalk}
572
+\end{figure}
573
+
574
+These generated data look similar to the real facility set, with various drops and ascents. Resulting patterns like figure \ref{fig:dumm_pattern} also look very similar to real patterns like figure \ref{fig:small_pattern} above.
575
+
576
+\begin{figure}
577
+	\centering
578
+	\includegraphics[width=1\linewidth]{images/dummy_pattern}
579
+	\caption{8-edge pattern from the synthetic data set, min\_sup = .4}
580
+	\label{fig:dumm_pattern}
581
+\end{figure}
582
+
583
+The synthetic error log consisted of 1000 rows of errors, the OEE set of 35,919 rows (roughly 10 hours of second-by-second logs). Run times were very manageable for higher min\_sup values, but soon reached exponential (and thus unsustainable) growth for min\_sup values much lower than .2 (cf. table \ref{table:runtimes_syn}).
584
+
585
+\begin{table}[]
586
+	\centering
587
+	\caption{Run times and patterns found (synthetic data set)}
588
+	\label{table:runtimes_syn}
589
+	\begin{tabular}{l|l|l}
590
+		data set                                      & \textit{t} & patterns \\ \hline
591
+		import errors and graph generation            & 1s         &          \\
592
+		import and anomalies detection on OEE         & 8s         &          \\ \hline
593
+		\textit{gSpan} (min\_sup = .7)                & 2s         & 40       \\
594
+		\textit{gSpan} (min\_sup = .6)                & 8s         & 106      \\
595
+		\textit{gSpan} (min\_sup = .5)                & 19s        & 241      \\
596
+		\textit{gSpan} (min\_sup = .4)                & 74s        & 1056     \\ \hline
597
+		SVM training and validation   (min\_sup = .7) & 4s         &          \\
598
+		SVM training and validation   (min\_sup = .6) & 8s         &          \\
599
+		SVM training and validation   (min\_sup = .5) & 35s        &          \\
600
+		SVM training and validation   (min\_sup = .4) & 13m 14s    &
601
+	\end{tabular}
602
+\end{table}
603
+
604
+\subsection{Evaluation with OEE data set}
605
+OEE anomalies were split into a training data set and a validation data set, with 80\% of the data being used for training and the remaining 20\% used for validation. For the min\_sup = .5 run, the experimental mean slope $\tilde{S}$ (cf. algorithm \ref{alg:oeedetectanomalies}) was .5 with a standard deviation of .21 and a $c$-value of .1. The validation data set consisted of 49 time windows, 33 of which were deemed a noticeable drop by the OEE evaluation algorithm. Of these 33, the SVM correctly identified 28 as drops, for a sensitivity score of 85\%. Of the remaining 19 non-drops, 5 were falsely identified as positives, for a specificity score of 74\%.
606
+
607
+For the min\_sup = .4 run, experimental mean slope $\tilde{S}$ was .5 with a standard deviation of .21 and a $c$-value of .1. The validation data set consisted of 49 time windows, 33 of which were deemed as a noticeable drop by the OEE evaluation algorithm. Of these 33, the SVM correctly identified 28 as drops, for a sensitivity score of 84.85\%. Of the remaining 19 non-drops, 5 were falsely identified as positives, for a specificity score of 73.68\%.
608
+\section{Results for facility data set}
609
+The facility data set consisted of 57,171 rows of error logs and 802,800 rows of OEE evaluation data. Results for min\_sup values of .5 or lower could not be achieved. The algorithm consistently used up so much memory that it was OOM killed by the operating system after about a day of run time.
610
+
611
+\begin{table}
612
+	\centering
613
+	\caption{Run times and patterns found (facility data set)}
614
+	\label{table:runtimes_real} 
615
+	\begin{tabular}{l|l|l}
616
+		data set                                      & \textit{t}          & patterns \\ \hline
617
+		import errors and graph generation            & 50s                 &          \\
618
+		import and anomalies detection on OEE         & 2m 27s              &          \\ \hline
619
+		\textit{gSpan} (min\_sup = .9)                & 2m 20s              & 12       \\
620
+		\textit{gSpan} (min\_sup = .7)                & 6h 27m 12s          & 846      \\
621
+		\textit{gSpan} (min\_sup = .5)                & \textit{OOM killed} & --       \\ \hline
622
+		SVM training and validation   (min\_sup = .7) & 27s                 &
623
+	\end{tabular}
624
+\end{table}
625
+
626
+\subsection{Evaluation with OEE data set}
627
+\label{subsection:evaluation}
628
+OEE anomalies were split as above. Experimental mean slope $\tilde{S}$ (cf. algorithm \ref{alg:oeedetectanomalies}) was 1.1 with a standard deviation of .16 and a $c$-value of .1. The validation data set consisted of 486 time windows, 64 of which were deemed as a noticeable drop by the OEE evaluation algorithm. Of these 64, the SVM trained on patterns with a min\_sup of .7 correctly identified 60 as drops, for a sensitivity score of 93.75\%. Of the remaining 422 non-drops, 18 were identified as false positives, for a specificity score of 95.73\%.
629
+
630
+\chapter{Summary and Discussion}
631
+To conclude this bachelor's thesis, the following will summarize my findings with the acknowledgment that, although the essential research in this work was of my own design and execution, a project such as this is virtually impossible without guidance by an advisor and support by friends and family.
632
+
633
+With this work, I've introduced a method and provided a Python program to mine error log data for useful patterns, using a graph representation to take advantage of structural information and incorporate outside expert knowledge. I touched upon the most important concepts on which my model assumptions rest and expounded on some approaches that did not yield usable results.
634
+
635
+The proposed algorithm has been shown to produce patterns with adequate experimental time complexity, on both synthetic data and proprietary Siemens facility data. To a first approximation, the patterns found seem to provide real informational value and to be able to predict facility downtimes, as measured by a drop in OEE, to a reasonable degree. A possible next step would be to show the patterns and the thoughts that went into the OEE anomaly detection to an expert with domain knowledge and then refine the proposed approaches through a few more iterations.
636
+
637
+The proposed approach has been shown to be somewhat fragile, in that at least some implementation details of \textit{gSpan}, the overall data structure used in this work, and maybe even the included libraries should be reevaluated at a later time, to hammer out possible errors and improve on the interaction between parts. 
638
+
639
+Further improvements to the algorithm, especially to average-case time and memory performance, and extending it to directly process data streams instead of stale data, would be much appreciated, but are sadly out of scope for this bachelor's thesis.
640
+
641
+The results that \textit{could} be reached, however, point in a promising direction. The overall approach -- leveraging a graph-based mining algorithm against a time series of event logs -- seems to have merit, not least of all because the resulting patterns can be visualized in a way that immediately makes a lot of sense to both the casual observer and the expert with intimate domain knowledge.
642
+
643
+This remains true even though event logs don't immediately spring to mind as being structurally similar to networks; as such, this approach needs further research and should at least be tried again with similarly non-obvious graph data in the future.
644
+
645
+\chapter*{Acknowledgments}
646
+\addcontentsline{toc}{chapter}{Acknowledgments}  
647
+I want to thank first and foremost my advisor, Martin Ringsquandl, as without his ever intelligent and on-point criticisms and ideas this bachelor's thesis wouldn't have been possible. Second, Prof. Dr. Kröger, for allowing this thesis as an external bachelor's thesis at the Munich Siemens AG headquarters. Third, Siemens Corporate Technology, and all the intelligent and lovely folks at the Research, Development and Automation/Business Analytics and Monitoring unit, who provided valuable input not only during lunch hours. I also wish to thank all my friends and family, who constantly bugged me about my progress during the later stages, especially Irina, Christina and my mother. Last, but certainly not least, I also want to thank my cat Tigris, who bugged me as well while I was writing, although he was mostly out for food.
648
+
649
+\sloppy{Furthermore, I am very thankful to live in a time with tools such as \mbox{CytoScape}\cite{Shannon2003-gg}, \mbox{TeXStudio}\cite{Van_der_Zander_undated-kf}, \mbox{TeXLive}\cite{Rahtz_undated-bv}, \mbox{IntelliJ PyCharm}, and \mbox{PaperPile} for making the development of my analytics software and the later write-up much easier and more efficient.}
650
+
651
+\listoffigures
652
+\listoftables
653
+\listof{algorithm}{List of Algorithms}
654
+\bibliographystyle{utcaps}
655
+\bibliography{bibliography}
656
+
657
+\end{document}

+ 1291
- 0
2018-02-16_efficient_event_classification_through_constrained_subgraph_mining/utcaps.bst
File diff suppressed because it is too large
View File