Text.php
4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
<?php
namespace Faker\Provider;
/**
 * Base provider for generating pseudo-natural text from a sample corpus.
 *
 * Concrete subclasses supply the corpus through static::$baseText and may
 * redeclare the other static properties, or override the static helper
 * methods, to adapt tokenisation and sentence rules to their language.
 * All static members are therefore accessed through late static binding.
 */
abstract class Text extends Base
{
    /**
     * @var string Sample corpus the word table is built from.
     */
    protected static $baseText = '';

    /**
     * @var string Delimiter used to split the corpus into words and to join words back together.
     */
    protected static $separator = ' ';

    /**
     * @var integer Number of characters accounted for each separator when measuring the generated text.
     */
    protected static $separatorLen = 1;

    /**
     * @var array|null Per-instance cache of the corpus split into words (null until first use).
     */
    protected $explodedText;

    /**
     * @var array Per-instance cache of word tables, keyed by index size.
     */
    protected $consecutiveWords = array();

    /**
     * @var boolean Whether generated text must begin with a word starting with an uppercase letter.
     */
    protected static $textStartsWithUppercase = true;

    /**
     * Generate a text string by the Markov chain algorithm.
     *
     * Depending on the $maxNbChars, returns a random valid looking text. The algorithm
     * generates a weighted table with the specified number of words as the index and the
     * possible following words as the value.
     *
     * @example 'Alice, swallowing down her flamingo, and began by taking the little golden key'
     * @param integer $maxNbChars Maximum number of characters the text should contain (minimum: 10)
     * @param integer $indexSize  Determines how many words are considered for the generation of the next word.
     *                            The minimum is 1, and it produces the higher level of randomness, although the
     *                            generated text usually doesn't make sense. Higher index sizes (up to 5)
     *                            produce more correct text, at the price of less randomness.
     * @return string
     * @throws \InvalidArgumentException When $maxNbChars is below 10 or $indexSize is outside 1..5.
     */
    public function realText($maxNbChars = 200, $indexSize = 2)
    {
        if ($maxNbChars < 10) {
            throw new \InvalidArgumentException('maxNbChars must be at least 10');
        }
        if ($indexSize < 1) {
            throw new \InvalidArgumentException('indexSize must be at least 1');
        }
        if ($indexSize > 5) {
            throw new \InvalidArgumentException('indexSize must be at most 5');
        }

        $words = $this->getConsecutiveWords($indexSize);
        $result = array();
        $resultLength = 0;

        // take a random starting point
        // (randomKey/randomElement are not defined in this class; presumably inherited from Base)
        $next = static::randomKey($words);

        while ($resultLength < $maxNbChars && isset($words[$next])) {
            // fetch a random word to append
            $word = static::randomElement($words[$next]);

            // calculate next index: slide the window one word forward
            $currentWords = static::explode($next);
            $currentWords[] = $word;
            array_shift($currentWords);
            $next = static::implode($currentWords);

            // ensure the text starts with a valid first word; keep walking the chain until one shows up
            if ($resultLength === 0 && !static::validStart($word)) {
                continue;
            }

            // append the element
            $result[] = $word;
            $resultLength += static::strlen($word) + static::$separatorLen;
        }

        // Remove the element that caused the text to overflow. When the loop stopped
        // because the chain had no successor, nothing overflowed and every word is kept.
        if ($resultLength >= $maxNbChars) {
            array_pop($result);
        }

        // build result
        return static::appendEnd(static::implode($result));
    }

    /**
     * Build (and cache per instance) the table mapping each run of $indexSize
     * consecutive corpus words to the list of words that follow that run.
     *
     * A follower is listed once per occurrence, so picking uniformly from the
     * list is weighted by how often the follower appears after the run.
     *
     * @param integer $indexSize Number of consecutive words forming a table key.
     * @return array Map of joined word run => list of following words.
     */
    protected function getConsecutiveWords($indexSize)
    {
        if (!isset($this->consecutiveWords[$indexSize])) {
            $parts = $this->getExplodedText();

            // the first $indexSize words seed the sliding window; the rest are followers
            $index = array_slice($parts, 0, $indexSize);
            $followers = array_slice($parts, $indexSize);

            $words = array();
            foreach ($followers as $word) {
                $stringIndex = static::implode($index);
                if (!isset($words[$stringIndex])) {
                    $words[$stringIndex] = array();
                }
                $words[$stringIndex][] = $word;

                // slide the window one word forward
                array_shift($index);
                $index[] = $word;
            }

            // cache look up words for performance
            $this->consecutiveWords[$indexSize] = $words;
        }

        return $this->consecutiveWords[$indexSize];
    }

    /**
     * Return the corpus as a list of words, splitting it on first use.
     *
     * Every run of whitespace in the corpus is collapsed to a single space
     * before the text is handed to static::explode().
     *
     * @return array
     */
    protected function getExplodedText()
    {
        if ($this->explodedText === null) {
            $this->explodedText = static::explode(preg_replace('/\s+/u', ' ', static::$baseText));
        }

        return $this->explodedText;
    }

    /**
     * Split a text into words on static::$separator.
     *
     * @param string $text
     * @return array
     */
    protected static function explode($text)
    {
        return explode(static::$separator, $text);
    }

    /**
     * Join words into a text using static::$separator.
     *
     * @param array $words
     * @return string
     */
    protected static function implode($words)
    {
        return implode(static::$separator, $words);
    }

    /**
     * Measure a text in UTF-8 characters when the mbstring extension is
     * available, falling back to the byte length otherwise.
     *
     * @param string $text
     * @return integer
     */
    protected static function strlen($text)
    {
        return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : strlen($text);
    }

    /**
     * Tell whether a word may open the generated text.
     *
     * When static::$textStartsWithUppercase is enabled the word must begin with
     * a Unicode uppercase letter; otherwise any word is accepted. The flag is
     * read through late static binding so that a subclass redeclaring it is
     * honoured.
     *
     * @param string $word
     * @return boolean
     */
    protected static function validStart($word)
    {
        if (!static::$textStartsWithUppercase) {
            return true;
        }

        // preg_match yields 1 on match, 0 on no match and false on error
        return preg_match('/^\p{Lu}/u', $word) === 1;
    }

    /**
     * Terminate the generated text with a full stop.
     *
     * @param string $text
     * @return string
     */
    protected static function appendEnd($text)
    {
        return $text.'.';
    }
}