Эволюция семантики экономической терминологии |
||||
Диссертация | Автореферат | Материалы и инструменты | Что это? | |
|
Приложение II. Алгоритмы для работы с параллельными текстами(см. также: инструмент поиска синонимов) Алгоритм разделения текста на предложенияPrivate Sub BreakIntoSentences() Dim rsSource As Recordset, rsTarget As Recordset Dim sCurrentChunk As String Dim sSentence As String Dim iNextEnd As Integer Set rsSource = CurrentDb.OpenRecordset("SELECT * FROM [1895 source]") Set rsTarget = CurrentDb.OpenRecordset("1895 Sentences") While Not rsSource.EOF sCurrentChunk = rsSource![Text] While sCurrentChunk <> "" iNextEnd = InStr(sCurrentChunk, ". ") If InStr(sCurrentChunk, "! ") > 0 And InStr(sCurrentChunk, "! ") < iNextEnd Then iNextEnd = InStr(sCurrentChunk, "! ") If InStr(sCurrentChunk, "? ") > 0 And InStr(sCurrentChunk, "? ") < iNextEnd Then iNextEnd = InStr(sCurrentChunk, "? ") If iNextEnd > 0 Then sSentence = Left(sCurrentChunk, iNextEnd + 1) sCurrentChunk = Right(sCurrentChunk, Len(sCurrentChunk) - iNextEnd - 1) Else sSentence = sCurrentChunk sCurrentChunk = "" End If rsTarget.AddNew rsTarget![Sentence] = sSentence rsTarget.Update Wend rsSource.MoveNext Wend rsSource.Close rsTarget.Close End Sub Алгоритм разделения предложений на слова.Private Sub BreakIntoWords() Dim i As Integer Dim sCurrentChar As String Dim sSentence As String, sWord As String Dim rsSource As Recordset, rsTarget As Recordset Dim iWordNo As Integer Set rsSource = CurrentDb.OpenRecordset("SELECT * FROM [1895 Sentences]") Set rsTarget = CurrentDb.OpenRecordset("1895 Words")
While Not rsSource.EOF sSentence = rsSource![Sentence] For i = 1 To Len(sSentence) sCurrentChar = Mid(sSentence, i, 1) If sCurrentChar Like "[абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯіѣ]" Then sWord = sWord & sCurrentChar Else If sWord <> "" Then iWordNo = iWordNo + 1 rsTarget.AddNew rsTarget![Word] = sWord rsTarget![Sentence no] = rsSource![ID] rsTarget![Word no] = iWordNo rsTarget.Update sWord = "" End If End If Next i If sWord <> "" Then iWordNo = iWordNo + 1 rsTarget.AddNew rsTarget![Word] = sWord rsTarget![Sentence no] = rsSource![ID] rsTarget![Word no] = iWordNo rsTarget.Update sWord = "" End If rsSource.MoveNext iWordNo = 0 Wend End Sub Алгоритм оценки графической близости словоформFunction word_similarity(sWord1 As String, sWord2 As String) As Double Const BackFrame = 1 Const ForwardFrame = 1 Dim iCurrentW1Position As Integer Dim iCurrentW2Position As Integer Dim iLastMatchedW1Position As Integer Dim iLastMatchedW2Position As Integer Dim iMatchCount As Integer Dim iMisMatchCount As Integer Dim bMatchFound As Boolean Dim pd As Double
For iCurrentW1Position = 1 To Len(sWord1) bMatchFound = False For iCurrentW2Position = _ IIf(iLastMatchedW2Position + 1 > iCurrentW1Position - BackFrame, iLastMatchedW2Position + 1, iCurrentW1Position - BackFrame) _ To iCurrentW1Position + ForwardFrame If Mid(sWord1, iCurrentW1Position, 1) = Mid(sWord2, iCurrentW2Position, 1) Then bMatchFound = True iLastMatchedW2Position = iCurrentW2Position Exit For End If Next iCurrentW2Position If bMatchFound Then iMatchCount = iMatchCount + 1 Else iMisMatchCount = iMisMatchCount + 1 End If Next iCurrentW1Position
For iCurrentW2Position = 1 To Len(sWord2) bMatchFound = False For iCurrentW1Position = _ IIf(iLastMatchedW1Position + 1 > iCurrentW2Position - BackFrame, iLastMatchedW1Position + 1, iCurrentW2Position - BackFrame) _ To iCurrentW2Position + ForwardFrame If Mid(sWord2, iCurrentW2Position, 1) = Mid(sWord1, iCurrentW1Position, 1) Then bMatchFound = True iLastMatchedW1Position = iCurrentW1Position Exit For End If Next iCurrentW1Position If bMatchFound Then iMatchCount = iMatchCount + 1 Else iMisMatchCount = iMisMatchCount + 1 End If Next iCurrentW2Position
word_similarity = iMatchCount / (iMisMatchCount + iMatchCount) End Function
Алгоритмы выравнивания предложенийType alignment x1 As String y1 As String x2 As String y2 As String x1_deleted As String y1_deleted As String d As Long End Type
Private Sub Command1_Click()
Me.MousePointer = vbHourglass dao.DBEngine.SystemDB = "system.mdw" Set ws = dao.DBEngine.CreateWorkspace("test", "Admin", "") Set db = ws.OpenDatabase(InputBox("Database location:"))
Dim lRecords1895 As Long, lRecords1931 As Long Dim a() As alignment Dim s1() As String, s2() As String Dim ls1() As Integer, ls2() As Integer Dim n As Long Dim rs1 As Recordset, rs2 As Recordset, rs_target As Recordset Dim i As Long
lRecords1895 = txtToRecord1895.Text - txtFromRecord1895.Text + 1 lRecords1931 = txtToRecord1931.Text - txtFromRecord1931.Text + 1 Set rs1 = db.OpenRecordset("SELECT [1895 sentences explicitly split].[Sentence] AS [Sentence], [1895 sentences].[Sentence] AS [Text], Len([1895 sentences].[Sentence]) AS [Length] FROM [1895 sentences explicitly split] INNER JOIN [1895 sentences] ON [1895 sentences explicitly split].[Sentence No]= [1895 sentences].[ID]") rs1.MoveLast ReDim s1(0 To lRecords1895 - 1) '(rs1.RecordCount - 1) ReDim ls1(0 To lRecords1895 - 1) rs1.MoveFirst rs1.Move txtFromRecord1895.Text - 1 For i = 0 To UBound(s1) s1(i) = rs1![Sentence] ls1(i) = rs1![Length] rs1.MoveNext Next i Set rs2 = db.OpenRecordset("SELECT [1931 sentences explicitly split].[Sentence] AS [Sentence], [1931 sentences].[Sentence] AS [Text], Len([1931 sentences].[Sentence]) AS [Length] FROM [1931 sentences explicitly split] INNER JOIN [1931 sentences] ON [1931 sentences explicitly split].[Sentence No]= [1931 sentences].[ID]") rs2.MoveLast ReDim s2(0 To lRecords1931 - 1) '(rs2.RecordCount - 1) ReDim ls2(0 To lRecords1931 - 1) rs2.MoveFirst rs2.Move txtFromRecord1931.Text - 1 For i = 0 To UBound(s2) s2(i) = rs2![Sentence] ls2(i) = rs2![Length] rs2.MoveNext Next i n = seq_align(s1, s2, ls1, ls2, UBound(s1), UBound(s2), a) 'n = seq_align(len1, len2, UBound(len1), UBound(len2), a) rs1.MoveFirst rs2.MoveFirst Set rs_target = db.OpenRecordset("Aligned") For i = 0 To n - 1 rs_target.AddNew If a(i).x1 <> "" Then rs1.FindFirst ("[Sentence]='" & a(i).x1 & "'") If rs1.NoMatch Then rs_target![Text EN] = "(Not found!)" Else rs_target![Text EN] = rs1![Text] End If If a(i).x2 <> "" Then rs1.FindFirst ("[Sentence]='" & a(i).x2 & "'") If rs1.NoMatch Then rs_target![Text EN] = rs_target![Text EN] & "(Not found!)" Else rs_target![Text EN] = rs_target![Text EN] & rs1![Text] End If rs_target![EN Merged] = True End If End If If a(i).y1 <> "" Then rs2.FindFirst ("[Sentence]='" & a(i).y1 & "'") If rs2.NoMatch Then rs_target![Text RU] = "(Not found!)" Else rs_target![Text RU] = rs2![Text] End If If a(i).y2 <> "" Then rs2.FindFirst ("[Sentence]='" & a(i).y2 & "'") If rs2.NoMatch Then rs_target![Text RU] = rs_target![Text RU] & "(Not found!)" Else rs_target![Text RU] = rs_target![Text RU] & rs2![Text] End If rs_target![RU Merged] = True End If Else 'Debug.Print "<None RU>" End If rs_target![d] = a(i).d rs_target.Update Next i rs_target.Close rs1.Close rs2.Close db.Close ws.Close Me.MousePointer = vbDefault MsgBox n End Sub
Function pnorm(z As Double) As Double 'Returns the area under a normal ditribution 'from -inf to z standard deviations ' 'From W. Gale., K. Church 1990 ' Dim t As Double t = 1 / (1 + 0.2316419 * z) pnorm = 1 - 0.3989423 * _ Exp(-z * z / 2) * _ ((((1.330274429 * t - 1.821255978) * t _ + 1.781477937) * t - 0.356563782) * t + 0.31938153) * t End Function
Function two_side_distance(x1 As String, y1 As String, x2 As String, y2 As String, lx1 As Integer, ly1 As Integer, lx2 As Integer, ly2 As Integer) As Long ' 'From W. Gale., K. Church 1990 ' Const penalty21 As Long = 230 ' -100 * log((prob of 2-1 match) / (prob of 1-1 match)) Const penalty22 As Long = 1000 ' -100 * log((prob of 2-2 match) / (prob of 1-1 match)) Const penalty01 As Long = 250 ' -100 * log((prob of 0-1 match) / (prob of 1-1 match)) If lx2 = 0 And ly2 = 0 Then If lx1 = 0 Then ' insertion two_side_distance = penalty01 ' sentence_match(x1, y1) +penalty01 ElseIf ly1 = 0 Then ' deletion two_side_distance = penalty01 ' sentence_match(x1, y1) +penalty01 Else two_side_distance = sentence_match(x1, y1, lx1, ly1) ' substitution End If ElseIf lx2 = 0 Then ' expansion two_side_distance = sentence_match(x1, y1 & y2, lx1, ly1 + ly2) + penalty21 ElseIf ly2 = 0 Then ' contraction two_side_distance = sentence_match(x1 & x2, y1, lx1 + lx2, ly1) + penalty21 Else ' merger two_side_distance = sentence_match(x1 & x2, y1 & y2, lx1 + lx2, ly1 + ly2) + penalty22 End If End Function
Function seq_align(x() As String, y() As String, lx() As Integer, ly() As Integer, nx As Long, ny As Long, Align() As alignment) As Long 'seq_align by Mike Riley 'x and y are sequences of objects, represented as non-zero ints, to be aligned. 'dist_funct(x1, y1, x2, y2) is a distance function of 4 args: 'dist_funct(x1, y1, 0, 0) gives cost of substitution of x1 by y1. 'dist_funct(x1, 0, 0, 0) gives cost of deletion of x1. 'dist_funct(0, y1, 0, 0) gives cost of insertion of y1. 'dist_funct(x1, y1, x2, 0) gives cost of contraction of (x1,x2) to y1. 'dist_funct(x1, y1, 0, y2) gives cost of expansion of x1 to (y1,y2). 'dist_funct(x1, y1, x2, y2) gives cost to match (x1,x2) to (y1,y2). 'align is the alignment, with (align(i).x1, align(i).x2) aligned 'with (align(i).y1, align(i).y2). Zero in align().x1 and align().y1 'correspond to insertion and deletion, respectively. Non-zero in 'align().x2 and align().y2 correspond to contraction and expansion, 'respectively. align().d gives the distance for that pairing. 'The function returns the length of the alignment.
Const MAXINT As Long = 2147483647 '32767
'Dim ah As New align_helper Dim pathx() As Long, pathy() As Long, dist() As Long, n As Long Dim i As Long, j As Long, oi As Long, oj As Long, di As Long, dj As Long, d1 As Long, d2 As Long, d3 As Long, d4 As Long, d5 As Long, d6 As Long, DMin As Long Dim ralign() As alignment 'Init nx, ny ReDim dist((nx + 1), (ny + 1)) ReDim pathx((nx + 1), (ny + 1)) ReDim pathy((nx + 1), (ny + 1)) ReDim ralign(nx + ny) For j = 0 To ny For i = 0 To nx ' substitution If i > 0 And j > 0 Then d1 = dist(i - 1, j - 1) + two_side_distance(x(i - 1), y(j - 1), "", "", lx(i - 1), ly(j - 1), 0, 0) 'OK Else d1 = MAXINT End If ' deletion If i > 0 Then d2 = dist(i - 1, j) + two_side_distance(x(i - 1), "", "", "", lx(i - 1), 0, 0, 0) 'OK Else d2 = MAXINT End If ' insertion If j > 0 Then d3 = dist(i, j - 1) + two_side_distance("", y(j - 1), "", "", 0, ly(j - 1), 0, 0) 'OK Else d3 = MAXINT End If ' contraction If i > 1 And j > 0 Then d4 = dist(i - 2, j - 1) + two_side_distance(x(i - 1), y(j - 1), x(i - 2), "", lx(i - 1), ly(j - 1), lx(i - 2), 0) '(x(i - 1), y(j - 1), x(i - 2), "") '(x(i - 2), y(j - 1), x(i - 1), 0) Else d4 = MAXINT End If 'expansion If i > 0 And j > 1 Then d5 = dist(i - 1, j - 2) + two_side_distance(x(i - 1), y(j - 2), "", y(j - 1), lx(i - 1), ly(j - 2), 0, ly(j - 1)) '(x(i - 1), y(j - 2), "", y(j - 1)) '(x(i - 1), y(j - 1), 0, y(j - 2)) - mine Else d5 = MAXINT End If ' melding If i > 1 And j > 1 Then d6 = dist(i - 2, j - 2) + two_side_distance(x(i - 2), y(j - 2), x(i - 1), y(j - 1), lx(i - 2), ly(j - 2), lx(i - 1), ly(j - 1)) '(x(i - 2), y(j - 2), x(i - 1), y(j - 1)) '(x(i - 1), y(j - 1), x(i - 2), y(j - 2)) - mine Else d6 = MAXINT End If DMin = d1 If d2 < DMin Then DMin = d2 If d3 < DMin Then DMin = d3 If d4 < DMin Then DMin = d4 If d5 < DMin Then DMin = d5 If d6 < DMin Then DMin = d6 If DMin = MAXINT Then dist(i, j) = 0 ElseIf DMin = d1 Then dist(i, j) = d1 pathx(i, j) = i - 1 pathy(i, j) = j - 1 ElseIf DMin = d2 Then dist(i, j) = d2 pathx(i, j) = i - 1 pathy(i, j) = j ElseIf DMin = d3 Then dist(i, j) = d3 pathx(i, j) = i pathy(i, j) = j - 1 ElseIf DMin = d4 Then dist(i, j) = d4 pathx(i, j) = i - 2 pathy(i, j) = j - 1 ElseIf DMin = d5 Then dist(i, j) = d5 pathx(i, j) = i - 1 pathy(i, j) = j - 2 Else ' dmin = d6 dist(i, j) = d6 pathx(i, j) = i - 2 pathy(i, j) = j - 2 End If '''''''''''''''''''''' 'Debug.Print "i=" & i & ", j=" & j 'Debug.Print "pathx:" 'PrintArray pathx 'Debug.Print "pathy:" 'PrintArray pathy 'Debug.Print "dist:" 'PrintArray dist '''''''''''''''''''''' Next i 'Debug.Print j Main.txtStatus.Text = j DoEvents Next j n = 0 i = nx j = ny While i > 0 Or j > 0 oi = pathx(i, j) oj = pathy(i, j) di = i - oi dj = j - oj If di = 1 And dj = 1 Then ' substitution ralign(n).x1 = x(i - 1) ralign(n).y1 = y(j - 1) ralign(n).x2 = "" ralign(n).y2 = "" ralign(n).d = dist(i, j) - dist(i - 1, j - 1) n = n + 1 ElseIf di = 1 And dj = 0 Then 'deletion ralign(n).x1 = x(i - 1) ralign(n).y1 = "" ralign(n).x2 = "" ralign(n).y2 = "" ralign(n).d = dist(i, j) - dist(i - 1, j) ralign(n).y1_deleted = y(i - 1) n = n + 1 ElseIf di = 0 And dj = 1 Then ' insertion ralign(n).x1 = "" ralign(n).y1 = y(j - 1) ralign(n).x2 = "" ralign(n).y2 = "" ralign(n).d = dist(i, j) - dist(i, j - 1) n = n + 1 ElseIf dj = 1 Then ' contraction ralign(n).x1 = x(i - 2) ralign(n).y1 = y(j - 1) ralign(n).x2 = x(i - 1) ralign(n).y2 = "" ralign(n).d = dist(i, j) - dist(i - 2, j - 1) n = n + 1 ElseIf di = 1 Then ' expansion ralign(n).x1 = x(i - 1) ralign(n).y1 = y(j - 2) ralign(n).x2 = "" ralign(n).y2 = y(j - 1) ralign(n).d = dist(i, j) - dist(i - 1, j - 2) n = n + 1 Else ' di = 2 And dj =2 ( melding) ralign(n).x1 = x(i - 2) ralign(n).y1 = y(j - 2) ralign(n).x2 = x(i - 1) ralign(n).y2 = y(j - 1) ralign(n).d = dist(i, j) - dist(i - 2, j - 2) n = n + 1 End If i = oi j = oj Wend ReDim Align(n) For i = 0 To n - 1 Align(n - i - 1) = ralign(i) Next i seq_align = n End Function
Function sentence_similarity(sSentence1 As String, sSentence2 As String) As Double 'v.2 Static Similarities() As Byte Dim cS1 As Collection Dim cS2 As Collection Dim iCurrentS1Position As Integer Dim iCurrentS2Position As Integer Dim iMatchCount As Integer Dim iMisMatchCount As Integer Dim bMatchFound As Boolean Static rsSimilarities As Recordset If rsSimilarities Is Nothing Then Debug.Print "sentence_similarity initialising..." Set rsSimilarities = db.OpenRecordset("Similarities", dbOpenSnapshot) ReDim Similarities(1 To DMax("[Word 1895 ID]", "Similarities"), 1 To DMax("[Word 1931 ID]", "Similarities")) While Not rsSimilarities.EOF Similarities(rsSimilarities![Word 1895 ID], rsSimilarities![Word 1931 ID]) = 1 rsSimilarities.MoveNext Wend Debug.Print "done." End If 'Convert chr(27)-delimited sentences into collections Set cS1 = SentenceToCollection(sSentence1) Set cS2 = SentenceToCollection(sSentence2) ' For iCurrentS1Position = 1 To cS1.Count bMatchFound = False For iCurrentS2Position = 1 To cS2.Count If cS1(iCurrentS1Position) <= UBound(Similarities, 1) And cS2(iCurrentS2Position) <= UBound(Similarities, 2) Then If Similarities(cS1(iCurrentS1Position), cS2(iCurrentS2Position)) = 1 Then bMatchFound = True Exit For End If End If Next iCurrentS2Position If bMatchFound Then iMatchCount = iMatchCount + 1 Else iMisMatchCount = iMisMatchCount + 1 End If Next iCurrentS1Position
For iCurrentS2Position = 1 To cS2.Count bMatchFound = False For iCurrentS1Position = 1 To cS1.Count If cS1(iCurrentS1Position) <= UBound(Similarities, 1) And cS2(iCurrentS2Position) <= UBound(Similarities, 2) Then If Similarities(cS1(iCurrentS1Position), cS2(iCurrentS2Position)) = 1 Then bMatchFound = True Exit For End If End If Next iCurrentS1Position If bMatchFound Then iMatchCount = iMatchCount + 1 Else iMisMatchCount = iMisMatchCount + 1 End If Next iCurrentS2Position
sentence_similarity = iMatchCount / (iMisMatchCount + iMatchCount) End Function
Function sentence_match(sSentence1 As String, sSentence2 As String, len1 As Integer, len2 As Integer) As Double Const BIG_DISTANCE = 2500 'Return -100 * log probability that an English sentence of length 'len1 is a translation of a foreign sentence of length len2. The 'probability is based on two parameters, the mean and variance of 'number of foreign characters per English character. ' 'From W. Gale., K. Church 1990 ' Dim z_length As Double, pd_length As Double, mean_length As Double Dim word_match_ratio As Double, pd_similarity As Double Dim pd As Double Const c_length As Double = 0.92 Const s2_length As Double = 6.8 Const length_weight = 0.2 Const bad_pd_similarity_under = 0.2 Const good_pd_length_over = 0.8 If len1 = 0 And len2 = 0 Then Exit Function 'return 0
'Calculate the match probability based on word matches pd_similarity = sentence_similarity(sSentence1, sSentence2)
If pd_similarity < bad_pd_similarity_under Then 'Calculate the match probability based on lentgh mean_length = (len1 + len2 / c_length) / 2 z_length = (c_length * len1 - len2) / Sqr(s2_length * mean_length) 'Need to deal with both sides of the normal distribution If z_length < 0 Then z_length = -z_length pd_length = 2 * (1 - pnorm(z_length)) If pd_length > good_pd_length_over Then pd = pd_length * length_weight Else pd = pd_similarity End If
If pd > 0 Then sentence_match = Int(-100 * Log(pd)) Else sentence_match = BIG_DISTANCE End If
End Function
|
|||
© Б.Н. Рахимбердиев, 2003. |