Эволюция семантики экономической терминологии

Приложение II. Алгоритмы для работы с параллельными текстами

(см. также: инструмент поиска синонимов)

Алгоритм разделения текста на предложения

Private Sub BreakIntoSentences()

Dim rsSource As Recordset, rsTarget As Recordset

Dim sCurrentChunk As String

Dim sSentence As String

Dim iNextEnd As Integer

Set rsSource = CurrentDb.OpenRecordset("SELECT * FROM [1895 source]")

Set rsTarget = CurrentDb.OpenRecordset("1895 Sentences")

While Not rsSource.EOF

 sCurrentChunk = rsSource![Text]

 While sCurrentChunk <> ""

 iNextEnd = InStr(sCurrentChunk, ". ")

 If InStr(sCurrentChunk, "! ") > 0 And InStr(sCurrentChunk, "! ") < iNextEnd Then iNextEnd = InStr(sCurrentChunk, "! ")

 If InStr(sCurrentChunk, "? ") > 0 And InStr(sCurrentChunk, "? ") < iNextEnd Then iNextEnd = InStr(sCurrentChunk, "? ")

 If iNextEnd > 0 Then

 sSentence = Left(sCurrentChunk, iNextEnd + 1)

 sCurrentChunk = Right(sCurrentChunk, Len(sCurrentChunk) - iNextEnd - 1)

 Else

 sSentence = sCurrentChunk

 sCurrentChunk = ""

 End If

 rsTarget.AddNew

 rsTarget![Sentence] = sSentence

 rsTarget.Update

 Wend

 rsSource.MoveNext

Wend

rsSource.Close

rsTarget.Close

End Sub

Алгоритм разделения предложений на слова.

Private Sub BreakIntoWords()

Dim i As Integer

Dim sCurrentChar As String

Dim sSentence As String, sWord As String

Dim rsSource As Recordset, rsTarget As Recordset

Dim iWordNo As Integer

Set rsSource = CurrentDb.OpenRecordset("SELECT * FROM [1895 Sentences]")

Set rsTarget = CurrentDb.OpenRecordset("1895 Words")

 

While Not rsSource.EOF

 sSentence = rsSource![Sentence]

 For i = 1 To Len(sSentence)

 sCurrentChar = Mid(sSentence, i, 1)

 If sCurrentChar Like "[абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯіѣ]" Then

 sWord = sWord & sCurrentChar

 Else

 If sWord <> "" Then

 iWordNo = iWordNo + 1

 rsTarget.AddNew

 rsTarget![Word] = sWord

 rsTarget![Sentence no] = rsSource![ID]

 rsTarget![Word no] = iWordNo

 rsTarget.Update

 sWord = ""

 End If

 End If

 Next i

 If sWord <> "" Then

 iWordNo = iWordNo + 1

 rsTarget.AddNew

 rsTarget![Word] = sWord

 rsTarget![Sentence no] = rsSource![ID]

 rsTarget![Word no] = iWordNo

 rsTarget.Update

 sWord = ""

 End If

 rsSource.MoveNext

 iWordNo = 0

Wend

End Sub

Алгоритм оценки графической близости словоформ

Function word_similarity(sWord1 As String, sWord2 As String) As Double

Const BackFrame = 1

Const ForwardFrame = 1

Dim iCurrentW1Position As Integer

Dim iCurrentW2Position As Integer

Dim iLastMatchedW1Position As Integer

Dim iLastMatchedW2Position As Integer

Dim iMatchCount As Integer

Dim iMisMatchCount As Integer

Dim bMatchFound As Boolean

Dim pd As Double

 

For iCurrentW1Position = 1 To Len(sWord1)

 bMatchFound = False

 For iCurrentW2Position = _

 IIf(iLastMatchedW2Position + 1 > iCurrentW1Position - BackFrame, iLastMatchedW2Position + 1, iCurrentW1Position - BackFrame) _

 To iCurrentW1Position + ForwardFrame

 If Mid(sWord1, iCurrentW1Position, 1) = Mid(sWord2, iCurrentW2Position, 1) Then

 bMatchFound = True

 iLastMatchedW2Position = iCurrentW2Position

 Exit For

 End If

 Next iCurrentW2Position

 If bMatchFound Then

 iMatchCount = iMatchCount + 1

 Else

 iMisMatchCount = iMisMatchCount + 1

 End If

Next iCurrentW1Position

 

For iCurrentW2Position = 1 To Len(sWord2)

 bMatchFound = False

 For iCurrentW1Position = _

 IIf(iLastMatchedW1Position + 1 > iCurrentW2Position - BackFrame, iLastMatchedW1Position + 1, iCurrentW2Position - BackFrame) _

 To iCurrentW2Position + ForwardFrame

 If Mid(sWord2, iCurrentW2Position, 1) = Mid(sWord1, iCurrentW1Position, 1) Then

 bMatchFound = True

 iLastMatchedW1Position = iCurrentW1Position

 Exit For

 End If

 Next iCurrentW1Position

 If bMatchFound Then

 iMatchCount = iMatchCount + 1

 Else

 iMisMatchCount = iMisMatchCount + 1

 End If

Next iCurrentW2Position

 

word_similarity = iMatchCount / (iMisMatchCount + iMatchCount)

End Function

 

Алгоритмы выравнивания предложений

Type alignment

 x1 As String

 y1 As String

 x2 As String

 y2 As String

 x1_deleted As String

 y1_deleted As String

 d As Long

End Type

 

Private Sub Command1_Click()

 

Me.MousePointer = vbHourglass

dao.DBEngine.SystemDB = "system.mdw"

Set ws = dao.DBEngine.CreateWorkspace("test", "Admin", "")

Set db = ws.OpenDatabase(InputBox("Database location:"))

 

Dim lRecords1895 As Long, lRecords1931 As Long

Dim a() As alignment

Dim s1() As String, s2() As String

Dim ls1() As Integer, ls2() As Integer

Dim n As Long

Dim rs1 As Recordset, rs2 As Recordset, rs_target As Recordset

Dim i As Long

 

lRecords1895 = txtToRecord1895.Text - txtFromRecord1895.Text + 1

lRecords1931 = txtToRecord1931.Text - txtFromRecord1931.Text + 1

Set rs1 = db.OpenRecordset("SELECT [1895 sentences explicitly split].[Sentence] AS [Sentence], [1895 sentences].[Sentence] AS [Text], Len([1895 sentences].[Sentence]) AS [Length] FROM [1895 sentences explicitly split] INNER JOIN [1895 sentences] ON [1895 sentences explicitly split].[Sentence No]= [1895 sentences].[ID]")

rs1.MoveLast

ReDim s1(0 To lRecords1895 - 1) '(rs1.RecordCount - 1)

ReDim ls1(0 To lRecords1895 - 1)

rs1.MoveFirst

rs1.Move txtFromRecord1895.Text - 1

For i = 0 To UBound(s1)

 s1(i) = rs1![Sentence]

 ls1(i) = rs1![Length]

 rs1.MoveNext

Next i

Set rs2 = db.OpenRecordset("SELECT [1931 sentences explicitly split].[Sentence] AS [Sentence], [1931 sentences].[Sentence] AS [Text], Len([1931 sentences].[Sentence]) AS [Length] FROM [1931 sentences explicitly split] INNER JOIN [1931 sentences] ON [1931 sentences explicitly split].[Sentence No]= [1931 sentences].[ID]")

rs2.MoveLast

ReDim s2(0 To lRecords1931 - 1) '(rs2.RecordCount - 1)

ReDim ls2(0 To lRecords1931 - 1)

rs2.MoveFirst

rs2.Move txtFromRecord1931.Text - 1

For i = 0 To UBound(s2)

 s2(i) = rs2![Sentence]

 ls2(i) = rs2![Length]

 rs2.MoveNext

Next i

n = seq_align(s1, s2, ls1, ls2, UBound(s1), UBound(s2), a) 'n = seq_align(len1, len2, UBound(len1), UBound(len2), a)

rs1.MoveFirst

rs2.MoveFirst

Set rs_target = db.OpenRecordset("Aligned")

For i = 0 To n - 1

 rs_target.AddNew

 If a(i).x1 <> "" Then

 rs1.FindFirst ("[Sentence]='" & a(i).x1 & "'")

 If rs1.NoMatch Then

 rs_target![Text EN] = "(Not found!)"

 Else

 rs_target![Text EN] = rs1![Text]

 End If

 If a(i).x2 <> "" Then

 rs1.FindFirst ("[Sentence]='" & a(i).x2 & "'")

 If rs1.NoMatch Then

 rs_target![Text EN] = rs_target![Text EN] & "(Not found!)"

 Else

 rs_target![Text EN] = rs_target![Text EN] & rs1![Text]

 End If

 rs_target![EN Merged] = True

 End If

 End If

 If a(i).y1 <> "" Then

 rs2.FindFirst ("[Sentence]='" & a(i).y1 & "'")

 If rs2.NoMatch Then

 rs_target![Text RU] = "(Not found!)"

 Else

 rs_target![Text RU] = rs2![Text]

 End If

 If a(i).y2 <> "" Then

 rs2.FindFirst ("[Sentence]='" & a(i).y2 & "'")

 If rs2.NoMatch Then

 rs_target![Text RU] = rs_target![Text RU] & "(Not found!)"

 Else

 rs_target![Text RU] = rs_target![Text RU] & rs2![Text]

 End If

 rs_target![RU Merged] = True

 End If

 Else

 'Debug.Print "<None RU>"

 End If

 rs_target![d] = a(i).d

 rs_target.Update

Next i

rs_target.Close

rs1.Close

rs2.Close

db.Close

ws.Close

Me.MousePointer = vbDefault

MsgBox n

End Sub

 

Function pnorm(z As Double) As Double

'Returns the area under a normal ditribution

'from -inf to z standard deviations

'

'From W. Gale., K. Church 1990

'

Dim t As Double

t = 1 / (1 + 0.2316419 * z)

pnorm = 1 - 0.3989423 * _

 Exp(-z * z / 2) * _

 ((((1.330274429 * t - 1.821255978) * t _

 + 1.781477937) * t - 0.356563782) * t + 0.31938153) * t

End Function

 

Function two_side_distance(x1 As String, y1 As String, x2 As String, y2 As String, lx1 As Integer, ly1 As Integer, lx2 As Integer, ly2 As Integer) As Long

'

'From W. Gale., K. Church 1990

'

Const penalty21 As Long = 230 ' -100 * log((prob of 2-1 match) / (prob of 1-1 match))

Const penalty22 As Long = 1000 ' -100 * log((prob of 2-2 match) / (prob of 1-1 match))

Const penalty01 As Long = 250 ' -100 * log((prob of 0-1 match) / (prob of 1-1 match))

If lx2 = 0 And ly2 = 0 Then

 If lx1 = 0 Then ' insertion

 two_side_distance = penalty01 ' sentence_match(x1, y1) +penalty01

 ElseIf ly1 = 0 Then ' deletion

 two_side_distance = penalty01 ' sentence_match(x1, y1) +penalty01

 Else

 two_side_distance = sentence_match(x1, y1, lx1, ly1) ' substitution

 End If

ElseIf lx2 = 0 Then ' expansion

 two_side_distance = sentence_match(x1, y1 & y2, lx1, ly1 + ly2) + penalty21

ElseIf ly2 = 0 Then ' contraction

 two_side_distance = sentence_match(x1 & x2, y1, lx1 + lx2, ly1) + penalty21

Else ' merger

 two_side_distance = sentence_match(x1 & x2, y1 & y2, lx1 + lx2, ly1 + ly2) + penalty22

End If

End Function

 

Function seq_align(x() As String, y() As String, lx() As Integer, ly() As Integer, nx As Long, ny As Long, Align() As alignment) As Long

'seq_align by Mike Riley

'x and y are sequences of objects, represented as non-zero ints, to be aligned.

'dist_funct(x1, y1, x2, y2) is a distance function of 4 args:

'dist_funct(x1, y1, 0, 0) gives cost of substitution of x1 by y1.

'dist_funct(x1, 0, 0, 0) gives cost of deletion of x1.

'dist_funct(0, y1, 0, 0) gives cost of insertion of y1.

'dist_funct(x1, y1, x2, 0) gives cost of contraction of (x1,x2) to y1.

'dist_funct(x1, y1, 0, y2) gives cost of expansion of x1 to (y1,y2).

'dist_funct(x1, y1, x2, y2) gives cost to match (x1,x2) to (y1,y2).

'align is the alignment, with (align(i).x1, align(i).x2) aligned

'with (align(i).y1, align(i).y2). Zero in align().x1 and align().y1

'correspond to insertion and deletion, respectively. Non-zero in

'align().x2 and align().y2 correspond to contraction and expansion,

'respectively. align().d gives the distance for that pairing.

'The function returns the length of the alignment.

 

Const MAXINT As Long = 2147483647 '32767

 

'Dim ah As New align_helper

Dim pathx() As Long, pathy() As Long, dist() As Long, n As Long

Dim i As Long, j As Long, oi As Long, oj As Long, di As Long, dj As Long, d1 As Long, d2 As Long, d3 As Long, d4 As Long, d5 As Long, d6 As Long, DMin As Long

Dim ralign() As alignment

'Init nx, ny

ReDim dist((nx + 1), (ny + 1))

ReDim pathx((nx + 1), (ny + 1))

ReDim pathy((nx + 1), (ny + 1))

ReDim ralign(nx + ny)

For j = 0 To ny

 For i = 0 To nx

 ' substitution

 If i > 0 And j > 0 Then

 d1 = dist(i - 1, j - 1) + two_side_distance(x(i - 1), y(j - 1), "", "", lx(i - 1), ly(j - 1), 0, 0) 'OK

 Else

 d1 = MAXINT

 End If

 ' deletion

 If i > 0 Then

 d2 = dist(i - 1, j) + two_side_distance(x(i - 1), "", "", "", lx(i - 1), 0, 0, 0) 'OK

 Else

 d2 = MAXINT

 End If

 ' insertion

 If j > 0 Then

 d3 = dist(i, j - 1) + two_side_distance("", y(j - 1), "", "", 0, ly(j - 1), 0, 0) 'OK

 Else

 d3 = MAXINT

 End If

 ' contraction

 If i > 1 And j > 0 Then

 d4 = dist(i - 2, j - 1) + two_side_distance(x(i - 1), y(j - 1), x(i - 2), "", lx(i - 1), ly(j - 1), lx(i - 2), 0) '(x(i - 1), y(j - 1), x(i - 2), "") '(x(i - 2), y(j - 1), x(i - 1), 0)

 Else

 d4 = MAXINT

 End If

 'expansion

 If i > 0 And j > 1 Then

 d5 = dist(i - 1, j - 2) + two_side_distance(x(i - 1), y(j - 2), "", y(j - 1), lx(i - 1), ly(j - 2), 0, ly(j - 1)) '(x(i - 1), y(j - 2), "", y(j - 1)) '(x(i - 1), y(j - 1), 0, y(j - 2)) - mine

 Else

 d5 = MAXINT

 End If

 ' melding

 If i > 1 And j > 1 Then

 d6 = dist(i - 2, j - 2) + two_side_distance(x(i - 2), y(j - 2), x(i - 1), y(j - 1), lx(i - 2), ly(j - 2), lx(i - 1), ly(j - 1)) '(x(i - 2), y(j - 2), x(i - 1), y(j - 1)) '(x(i - 1), y(j - 1), x(i - 2), y(j - 2)) - mine

 Else

 d6 = MAXINT

 End If

 DMin = d1

 If d2 < DMin Then DMin = d2

 If d3 < DMin Then DMin = d3

 If d4 < DMin Then DMin = d4

 If d5 < DMin Then DMin = d5

 If d6 < DMin Then DMin = d6

 If DMin = MAXINT Then

 dist(i, j) = 0

 ElseIf DMin = d1 Then

 dist(i, j) = d1

 pathx(i, j) = i - 1

 pathy(i, j) = j - 1

 ElseIf DMin = d2 Then

 dist(i, j) = d2

 pathx(i, j) = i - 1

 pathy(i, j) = j

 ElseIf DMin = d3 Then

 dist(i, j) = d3

 pathx(i, j) = i

 pathy(i, j) = j - 1

 ElseIf DMin = d4 Then

 dist(i, j) = d4

 pathx(i, j) = i - 2

 pathy(i, j) = j - 1

 ElseIf DMin = d5 Then

 dist(i, j) = d5

 pathx(i, j) = i - 1

 pathy(i, j) = j - 2

 Else ' dmin = d6

 dist(i, j) = d6

 pathx(i, j) = i - 2

 pathy(i, j) = j - 2

 End If

 ''''''''''''''''''''''

 'Debug.Print "i=" & i & ", j=" & j

 'Debug.Print "pathx:"

 'PrintArray pathx

 'Debug.Print "pathy:"

 'PrintArray pathy

 'Debug.Print "dist:"

 'PrintArray dist

 ''''''''''''''''''''''

 Next i

 'Debug.Print j

 Main.txtStatus.Text = j

 DoEvents

Next j

n = 0

i = nx

j = ny

While i > 0 Or j > 0

 oi = pathx(i, j)

 oj = pathy(i, j)

 di = i - oi

 dj = j - oj

 If di = 1 And dj = 1 Then ' substitution

 ralign(n).x1 = x(i - 1)

 ralign(n).y1 = y(j - 1)

 ralign(n).x2 = ""

 ralign(n).y2 = ""

 ralign(n).d = dist(i, j) - dist(i - 1, j - 1)

 n = n + 1

 ElseIf di = 1 And dj = 0 Then 'deletion

 ralign(n).x1 = x(i - 1)

 ralign(n).y1 = ""

 ralign(n).x2 = ""

 ralign(n).y2 = ""

 ralign(n).d = dist(i, j) - dist(i - 1, j)

 ralign(n).y1_deleted = y(i - 1)

 n = n + 1

 ElseIf di = 0 And dj = 1 Then ' insertion

 ralign(n).x1 = ""

 ralign(n).y1 = y(j - 1)

 ralign(n).x2 = ""

 ralign(n).y2 = ""

 ralign(n).d = dist(i, j) - dist(i, j - 1)

 n = n + 1

 ElseIf dj = 1 Then ' contraction

 ralign(n).x1 = x(i - 2)

 ralign(n).y1 = y(j - 1)

 ralign(n).x2 = x(i - 1)

 ralign(n).y2 = ""

 ralign(n).d = dist(i, j) - dist(i - 2, j - 1)

 n = n + 1

 ElseIf di = 1 Then ' expansion

 ralign(n).x1 = x(i - 1)

 ralign(n).y1 = y(j - 2)

 ralign(n).x2 = ""

 ralign(n).y2 = y(j - 1)

 ralign(n).d = dist(i, j) - dist(i - 1, j - 2)

 n = n + 1

 Else ' di = 2 And dj =2 ( melding)

 ralign(n).x1 = x(i - 2)

 ralign(n).y1 = y(j - 2)

 ralign(n).x2 = x(i - 1)

 ralign(n).y2 = y(j - 1)

 ralign(n).d = dist(i, j) - dist(i - 2, j - 2)

 n = n + 1

 End If

 i = oi

 j = oj

Wend

ReDim Align(n)

For i = 0 To n - 1

 Align(n - i - 1) = ralign(i)

Next i

seq_align = n

End Function

 

Function sentence_similarity(sSentence1 As String, sSentence2 As String) As Double

'v.2

Static Similarities() As Byte

Dim cS1 As Collection

Dim cS2 As Collection

Dim iCurrentS1Position As Integer

Dim iCurrentS2Position As Integer

Dim iMatchCount As Integer

Dim iMisMatchCount As Integer

Dim bMatchFound As Boolean

Static rsSimilarities As Recordset

If rsSimilarities Is Nothing Then

 Debug.Print "sentence_similarity initialising..."

 Set rsSimilarities = db.OpenRecordset("Similarities", dbOpenSnapshot)

 ReDim Similarities(1 To DMax("[Word 1895 ID]", "Similarities"), 1 To DMax("[Word 1931 ID]", "Similarities"))

 While Not rsSimilarities.EOF

 Similarities(rsSimilarities![Word 1895 ID], rsSimilarities![Word 1931 ID]) = 1

 rsSimilarities.MoveNext

 Wend

 Debug.Print "done."

End If

'Convert chr(27)-delimited sentences into collections

Set cS1 = SentenceToCollection(sSentence1)

Set cS2 = SentenceToCollection(sSentence2)

'

For iCurrentS1Position = 1 To cS1.Count

 bMatchFound = False

 For iCurrentS2Position = 1 To cS2.Count

 If cS1(iCurrentS1Position) <= UBound(Similarities, 1) And cS2(iCurrentS2Position) <= UBound(Similarities, 2) Then

 If Similarities(cS1(iCurrentS1Position), cS2(iCurrentS2Position)) = 1 Then

 bMatchFound = True

 Exit For

 End If

 End If

 Next iCurrentS2Position

 If bMatchFound Then

 iMatchCount = iMatchCount + 1

 Else

 iMisMatchCount = iMisMatchCount + 1

 End If

Next iCurrentS1Position

 

For iCurrentS2Position = 1 To cS2.Count

 bMatchFound = False

 For iCurrentS1Position = 1 To cS1.Count

 If cS1(iCurrentS1Position) <= UBound(Similarities, 1) And cS2(iCurrentS2Position) <= UBound(Similarities, 2) Then

 If Similarities(cS1(iCurrentS1Position), cS2(iCurrentS2Position)) = 1 Then

 bMatchFound = True

 Exit For

 End If

 End If

 Next iCurrentS1Position

 If bMatchFound Then

 iMatchCount = iMatchCount + 1

 Else

 iMisMatchCount = iMisMatchCount + 1

 End If

Next iCurrentS2Position

 

sentence_similarity = iMatchCount / (iMisMatchCount + iMatchCount)

End Function

 

Function sentence_match(sSentence1 As String, sSentence2 As String, len1 As Integer, len2 As Integer) As Double

Const BIG_DISTANCE = 2500

'Return -100 * log probability that an English sentence of length

'len1 is a translation of a foreign sentence of length len2. The

'probability is based on two parameters, the mean and variance of

'number of foreign characters per English character.

'

'From W. Gale., K. Church 1990

'

Dim z_length As Double, pd_length As Double, mean_length As Double

Dim word_match_ratio As Double, pd_similarity As Double

Dim pd As Double

Const c_length As Double = 0.92

Const s2_length As Double = 6.8

Const length_weight = 0.2

Const bad_pd_similarity_under = 0.2

Const good_pd_length_over = 0.8

If len1 = 0 And len2 = 0 Then Exit Function 'return 0

 

'Calculate the match probability based on word matches

pd_similarity = sentence_similarity(sSentence1, sSentence2)

 

If pd_similarity < bad_pd_similarity_under Then

 'Calculate the match probability based on lentgh

 mean_length = (len1 + len2 / c_length) / 2

 z_length = (c_length * len1 - len2) / Sqr(s2_length * mean_length)

 'Need to deal with both sides of the normal distribution

 If z_length < 0 Then z_length = -z_length

 pd_length = 2 * (1 - pnorm(z_length))

 If pd_length > good_pd_length_over Then pd = pd_length * length_weight

Else

 pd = pd_similarity

End If

 

If pd > 0 Then

 sentence_match = Int(-100 * Log(pd))

Else

 sentence_match = BIG_DISTANCE

End If

 

End Function

 

© Б.Н. Рахимбердиев, 2003.

Сайт управляется системой uCoz