forked from tom--/Collation-to-Charset-Table
-
Notifications
You must be signed in to change notification settings - Fork 0
/
collation_2_charset_table-1.php
155 lines (143 loc) · 4.65 KB
/
collation_2_charset_table-1.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
<?php
/*
 * USER CONFIGURATIONS
 */

/**
 * Unicode code point ranges for which Sphinx charset folding rules are generated.
 *
 * Each entry is a "START-END" pair of hexadecimal code points.
 *
 * You may want to add some terminal ranges to Sphinx's charset_table by hand. For
 * example, with the $ranges below the goal is to cover all "reasonable" alphabets
 * in the range 0000-FFFF, but utf8_general_ci doesn't do anything to most of that
 * range. So it is easier to run this script on a portion of the range and enter
 * the rest as terminals in charset_table manually, e.g.:
 *
 *     U+590..U+109F, U+1100..U+167F, U+1700..U+1DFF, U+2C60..U+2DFF, \
 *     U+2E80..U+2FFF, U+3040..U+DFFF, U+F900..U+FBFF, U+FE70..U+FFEF, \
 *
 * @var string[] Array of hex ranges as strings
 */
$ranges = array('0000-02AF', '0370-058F', '10A0-10FF', '1E00-1FFF');

/**
 * Name of the working database used for processing.
 *
 * NOTE: the script drops this database (if it exists) and re-creates it on
 * every run, so do not point it at a database holding data you care about.
 *
 * @var string
 */
$dbname = 'my_collation_db';

/**
 * Name of the working table, created inside the working database.
 *
 * @var string
 */
$tablename = 'my_table';

/**
 * Collation whose equivalence classes are extracted (utf8 only).
 *
 * Other collations to try include 'utf8_spanish_ci' and 'utf8_swedish_ci'.
 *
 * @var string
 */
$collation = 'utf8_general_ci';

/**
 * PDO data source name used to connect to the MySQL server.
 *
 * @var string
 */
$pdo_dsn = 'mysql:unix_socket=/tmp/mysql.sock';

/**
 * MySQL user name.
 *
 * @var string
 */
$pdo_user = 'root';

/**
 * MySQL password, read interactively from the terminal when the script starts.
 *
 * @var string
 */
$pdo_pass = prompt_silent('MySQL password for use: ');

/*
 * END OF USER CONFIGURATIONS
 * ============================================================================
 */
ini_set('default_charset', 'UTF-8');

// prompt_silent() returns false when it cannot obtain a password; do not try
// to connect with a bogus credential in that case.
if ($pdo_pass === false) {
    throw new RuntimeException('Could not read the MySQL password.');
}

// Use exception error mode so that a failing statement aborts the script
// instead of being silently ignored (the PDO default before PHP 8).
$db = new PDO(
    $pdo_dsn,
    $pdo_user,
    $pdo_pass,
    array(PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION)
);

// (Re)create an empty working database and table that use the collation under test.
$db->exec("SET NAMES 'utf8' COLLATE '$collation';");
$db->exec("DROP DATABASE IF EXISTS `$dbname`;");
$db->exec("CREATE DATABASE `$dbname` DEFAULT CHARACTER SET utf8 COLLATE $collation;");
$db->exec("USE `$dbname`;");
$db->exec(
    "CREATE TABLE IF NOT EXISTS `$tablename` (
        `dec` int(11) NOT NULL,
        `mychar` char(1) CHARACTER SET utf8 COLLATE $collation NOT NULL,
        `hex` char(4) NOT NULL,
        PRIMARY KEY (`dec`)
    ) ENGINE=MyISAM DEFAULT CHARSET=utf8;"
);

// The SQL insert clause without data
$ins = "INSERT IGNORE INTO `$tablename` (`dec`, `mychar`, `hex`) VALUES ";

// Get the max MySQL packet size
$packet = $db->query("SELECT @@session.max_allowed_packet;")->fetch(PDO::FETCH_NUM);

// Figure the max number of data bytes per MySQL packet, leaving generous room
// for the insert clause itself
$max_data_per_packet = (int) $packet[0] - 2 * strlen($ins);

// Budget 70 bytes per row tuple and always send at least one row per
// statement, even with a tiny max_allowed_packet.
$rows_per_insert = max(1, (int) floor($max_data_per_packet / 70));

// Add a row to the working table for every Unicode char in the ranges specified
$s = array();
foreach ($ranges as $range) {
    if (!preg_match('/^([0-9A-F]{1,6})-([0-9A-F]{1,6})$/i', $range, $m)) {
        trigger_error("Ignoring malformed range '$range'", E_USER_WARNING);
        continue;
    }

    // Convert the bounds to integers explicitly: since PHP 7, strings such as
    // "0x02AF" are no longer treated as numbers by comparisons, ++ or sprintf.
    $first = hexdec($m[1]);
    $last = hexdec($m[2]);

    // The rows are built from 4-digit UCS-2 literals and stored in a char(4)
    // `hex` column, so only code points up to U+FFFF can be represented.
    if ($last > 0xFFFF) {
        trigger_error(
            "Range '$range' extends past U+FFFF; code points above U+FFFF are skipped",
            E_USER_WARNING
        );
        $last = 0xFFFF;
    }

    for ($i = $first; $i <= $last; ++$i) {
        $hex = sprintf('%04x', $i);
        $s[] = "($i, CAST(_ucs2 x'$hex' AS CHAR CHARACTER SET utf8), '$hex')";

        // Flush a full batch so that no single INSERT exceeds the packet size
        if (count($s) >= $rows_per_insert) {
            $db->exec($ins . implode(',', $s) . ";");
            $s = array();
        }
    }
}

// Flush whatever is left over from the last, partially filled batch
if ($s) {
    $db->exec($ins . implode(',', $s) . ";");
}

// Now the interesting bit. Use MySQL's GROUP BY to group rows of characters
// according to the collation. Use GROUP_CONCAT to get each set of chars the
// collation considers equivalent as:
//   x: a comma separated list of utf8 characters
//   y: a comma separated list of hex unicode codepoints
$r = $db->query(
    "SELECT GROUP_CONCAT(`mychar` ORDER BY `dec` ASC SEPARATOR ',') AS x,
            GROUP_CONCAT(`hex` ORDER BY `dec` ASC SEPARATOR ',') AS y
     FROM `$tablename` GROUP BY `mychar`;"
);

// For each grouped set, write to stdout each column x and y as two comma-
// separated lists with a tab in between
foreach ($r as $row) {
    echo $row['x'] . "\t" . $row['y'] . "\n";
}
/**
 * Interactively prompts for a password.
 *
 * On Windows the prompt is shown through a temporary VBScript `InputBox`
 * run with `cscript`; on other systems bash's `read -s` is used so the typed
 * text is not echoed to the terminal. Requires `shell_exec`, so it won't work
 * with safe_mode settings or when `shell_exec` is disabled.
 *
 * @param string $prompt password entry prompt
 * @return string|false the interactively entered password (an empty string if
 *                      nothing could be read), or false if bash cannot be invoked
 */
function prompt_silent($prompt = "Enter Password:") {
    if (preg_match('/^win/i', PHP_OS)) {
        // sys_get_temp_dir() normally has no trailing separator, so add one
        // explicitly to keep the script inside the temp directory.
        $vbscript = rtrim(sys_get_temp_dir(), '/\\')
            . DIRECTORY_SEPARATOR . 'prompt_password.vbs';

        // VBScript escapes a double quote inside a string literal by doubling
        // it; backslashes have no special meaning there.
        file_put_contents(
            $vbscript,
            'wscript.echo(InputBox("'
            . str_replace('"', '""', $prompt)
            . '", "", "password here"))'
        );

        $command = "cscript //nologo " . escapeshellarg($vbscript);
        // Strip only the line ending added by wscript.echo so that trailing
        // spaces that are part of the password survive. shell_exec() returns
        // null on failure, hence the cast.
        $password = rtrim((string) shell_exec($command), "\r\n");
        unlink($vbscript);
        return $password;
    }

    $command = "/usr/bin/env bash -c 'echo OK'";
    if (rtrim((string) shell_exec($command)) !== 'OK') {
        trigger_error("Can't invoke bash");
        return false;
    }

    // `IFS=` and `-r` make read keep leading/trailing whitespace and
    // backslashes; printf with a quoted variable prints the value verbatim
    // (no word splitting, no glob expansion, no echo option parsing, no
    // trailing newline). Both the prompt and the whole script are passed
    // through escapeshellarg() so quotes in the prompt cannot break out.
    $script = 'IFS= read -r -s -p ' . escapeshellarg($prompt)
        . ' mypassword && printf "%s" "$mypassword"';
    $command = '/usr/bin/env bash -c ' . escapeshellarg($script);
    $password = (string) shell_exec($command);

    // The user's Enter key was not echoed, so move to a fresh line ourselves
    echo "\n";
    return $password;
}